Scrapy is great for large-scale scraping, but it falls apart when sites throw CAPTCHAs at you mid-crawl. Your spider shouldn't need to know about CAPTCHAs — that's infrastructure, not business logic.
Here's how I built a Scrapy downloader middleware that handles CAPTCHAs transparently.
The Problem
You're scraping a product catalog. After 50 requests, the site starts returning CAPTCHA pages instead of product data. Your spider sees garbage HTML, your items pipeline gets confused, and your crawl grinds to a halt.
The naive fix is adding CAPTCHA handling inside every spider:
# Don't do this — CAPTCHA logic pollutes every spider
class ProductSpider(scrapy.Spider):
def parse(self, response):
if "captcha" in response.text:
# solve captcha...
# retry request...
# handle errors...
pass
else:
yield {"title": response.css("h1::text").get()}
This doesn't scale. Every spider needs the same boilerplate. The right approach: a middleware.
The Middleware Architecture
Scrapy's downloader middleware sits between the engine and the downloader. It can intercept responses before they reach your spider:
Spider → Engine → [CaptchaMiddleware] → Downloader → Website
                         ↑
         intercept CAPTCHA responses here,
         solve, and retry automatically
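Concretely, a downloader middleware is just a class with optional hook methods. Here's the bare skeleton we'll flesh out (process_request and process_exception hooks exist too, but process_response is where CAPTCHA interception lives):

class CaptchaMiddleware:
    def process_response(self, request, response, spider):
        # Return the response to pass it through untouched,
        # or return a new Request to send it back through the engine.
        return response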
Building the Middleware
# captcha_middleware.py
import logging
import re
from urllib.parse import parse_qs, urlencode

import requests
from scrapy.http import HtmlResponse
logger = logging.getLogger(__name__)
class CaptchaMiddleware:
"""
Scrapy downloader middleware that automatically detects
and solves CAPTCHAs, then retries the original request.
"""
CAPTCHA_INDICATORS = [
"recaptcha",
"g-recaptcha",
"hcaptcha",
"h-captcha",
"cf-turnstile",
"challenges.cloudflare.com",
"captcha-delivery",
    ]

    # Form field each CAPTCHA type expects its token in
    TOKEN_FIELDS = {
        "recaptcha_v2": "g-recaptcha-response",
        "hcaptcha": "h-captcha-response",
        "turnstile": "cf-turnstile-response",
    }
def __init__(self, api_key, max_retries=3):
self.api_key = api_key
self.max_retries = max_retries
self.stats = {"detected": 0, "solved": 0, "failed": 0}
    @classmethod
    def from_crawler(cls, crawler):
return cls(
api_key=crawler.settings.get("CAPTCHA_API_KEY"),
max_retries=crawler.settings.getint("CAPTCHA_MAX_RETRIES", 3),
)
def process_response(self, request, response, spider):
# Skip if not HTML
if not isinstance(response, HtmlResponse):
return response
        # Detect CAPTCHA first, so clean responses are never miscounted
        if not self._has_captcha(response):
            return response

        # Give up once the retry budget is spent
        retries = request.meta.get("captcha_retries", 0)
        if retries >= self.max_retries:
            logger.warning(f"Max CAPTCHA retries reached for {request.url}")
            self.stats["failed"] += 1
            return response

        self.stats["detected"] += 1
        logger.info(f"CAPTCHA detected on {request.url} (attempt {retries + 1})")
        # Solve it
        token = self._solve_captcha(response)
        if not token:
            # Pass the CAPTCHA page through rather than stalling the crawl
            self.stats["failed"] += 1
            return response
self.stats["solved"] += 1
# Retry with token
new_request = request.copy()
new_request.meta["captcha_retries"] = retries + 1
new_request.meta["captcha_token"] = token
new_request.dont_filter = True
        # Inject the token into the form body (POST) or send it as a cookie (GET)
        if request.method == "POST":
            new_request = new_request.replace(
                body=self._inject_token(request.body, token, response)
            )
        else:
            # For GET requests, send the token back as a cookie named
            # after the detected CAPTCHA type
            field = self.TOKEN_FIELDS.get(
                self._detect_type(response), "g-recaptcha-response"
            )
            new_request.cookies[field] = token
return new_request
def _has_captcha(self, response):
text = response.text.lower()
return any(ind in text for ind in self.CAPTCHA_INDICATORS)
def _detect_type(self, response):
text = response.text.lower()
if "recaptcha" in text or "g-recaptcha" in text:
return "recaptcha_v2"
elif "hcaptcha" in text or "h-captcha" in text:
return "hcaptcha"
elif "cf-turnstile" in text:
return "turnstile"
return "recaptcha_v2" # default fallback
def _extract_sitekey(self, response):
# Try data-sitekey attribute
sitekey = response.css("[data-sitekey]::attr(data-sitekey)").get()
if sitekey:
return sitekey
        # Regex fallback for sitekeys embedded in inline JavaScript
        match = re.search(
            r'sitekey["\s:=]+["\']([0-9A-Za-z_\-]+)', response.text
        )
return match.group(1) if match else None
def _solve_captcha(self, response):
captcha_type = self._detect_type(response)
sitekey = self._extract_sitekey(response)
if not sitekey:
logger.error("Could not extract sitekey")
return None
try:
resp = requests.post(
"https://api.passxapi.com/solve",
json={
"type": captcha_type,
"sitekey": sitekey,
"url": response.url,
},
headers={"x-api-key": self.api_key},
timeout=30,
)
resp.raise_for_status()
return resp.json().get("token")
except Exception as e:
logger.error(f"CAPTCHA solve failed: {e}")
return None
    def _inject_token(self, body, token, response):
        field = self.TOKEN_FIELDS.get(
            self._detect_type(response), "g-recaptcha-response"
        )
        # Scrapy request bodies are bytes; parse_qs wants str
        if isinstance(body, bytes):
            body = body.decode("utf-8")
        params = parse_qs(body or "")
        params[field] = [token]
        return urlencode(params, doseq=True)
Enabling the Middleware
Add it to your Scrapy settings:
# settings.py
DOWNLOADER_MIDDLEWARES = {
"myproject.captcha_middleware.CaptchaMiddleware": 560,
}
# API key from environment
import os
CAPTCHA_API_KEY = os.getenv("PASSXAPI_KEY")
CAPTCHA_MAX_RETRIES = 3
Scrapy runs process_response through downloader middlewares in descending priority order, so at 560 our middleware sees each response after RedirectMiddleware (600) and HttpCompressionMiddleware (590) have followed redirects and decompressed the body, just before RetryMiddleware (550).
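For reference, here's an abridged excerpt of the neighboring defaults from Scrapy's DOWNLOADER_MIDDLEWARES_BASE (the dict name is mine; the paths and numbers are Scrapy's):

# Abridged from Scrapy's DOWNLOADER_MIDDLEWARES_BASE;
# responses flow through these in descending priority order
NEIGHBORING_DEFAULTS = {
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
    # CaptchaMiddleware at 560 slots in here on the response path
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
    "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
    "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
}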
Your Spiders Stay Clean
Now your spiders don't need to know CAPTCHAs exist:
class ProductSpider(scrapy.Spider):
name = "products"
start_urls = ["https://example.com/products"]
def parse(self, response):
# No CAPTCHA handling needed!
for product in response.css(".product-card"):
yield {
"title": product.css("h2::text").get(),
"price": product.css(".price::text").get(),
"url": product.css("a::attr(href)").get(),
}
next_page = response.css("a.next::attr(href)").get()
if next_page:
yield response.follow(next_page, self.parse)
If the middleware solves a CAPTCHA, the retried request goes back through the engine and the spider eventually receives a clean response, as if nothing happened.
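That said, the retry metadata still travels with the response if you ever want visibility, since response.meta is just the originating request's meta, which the middleware populated. A minimal sketch:

    def parse(self, response):
        # Optional: note that this response survived a CAPTCHA retry
        if response.meta.get("captcha_retries", 0):
            self.logger.debug(f"CAPTCHA solved en route to {response.url}")
        # ... normal parsing continues ...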
Adding Stats and Monitoring
Track how many CAPTCHAs your crawler is hitting:
class CaptchaMiddleware:
# ... (previous code) ...
def close_spider(self, spider):
logger.info(
f"CAPTCHA stats: "
f"detected={self.stats['detected']}, "
f"solved={self.stats['solved']}, "
f"failed={self.stats['failed']}"
)
        # Also write to Scrapy's stats collector
        spider.crawler.stats.set_value(
            "captcha/detected", self.stats["detected"]
        )
        spider.crawler.stats.set_value(
            "captcha/solved", self.stats["solved"]
        )
        spider.crawler.stats.set_value(
            "captcha/failed", self.stats["failed"]
        )
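A variation worth considering (a sketch, not what the code above does): drop the local dict and increment Scrapy's stats collector as events happen, so the counts land in the final crawl stats dump without a close_spider hook. Keep a crawler handle in from_crawler:

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls(
            api_key=crawler.settings.get("CAPTCHA_API_KEY"),
            max_retries=crawler.settings.getint("CAPTCHA_MAX_RETRIES", 3),
        )
        mw.crawler = crawler  # handle to the stats collector
        return mw

    # then, wherever the middleware does self.stats["detected"] += 1:
    # self.crawler.stats.inc_value("captcha/detected")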
Testing the Middleware
import pytest
from scrapy.http import HtmlResponse, Request
from captcha_middleware import CaptchaMiddleware
@pytest.fixture
def middleware():
    return CaptchaMiddleware(api_key="test-key")
def test_detects_recaptcha(middleware):
request = Request("https://example.com")
body = '<div class="g-recaptcha" data-sitekey="abc123"></div>'
response = HtmlResponse(
url="https://example.com",
request=request,
body=body.encode(),
)
assert middleware._has_captcha(response) is True
assert middleware._detect_type(response) == "recaptcha_v2"
assert middleware._extract_sitekey(response) == "abc123"
def test_passes_clean_pages(middleware):
request = Request("https://example.com")
body = "<html><body>Normal page</body></html>"
response = HtmlResponse(
url="https://example.com",
request=request,
body=body.encode(),
)
result = middleware.process_response(request, response, None)
assert result is response # unchanged
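One more test worth having checks that the middleware gives up after max_retries and passes the CAPTCHA page through instead of looping forever (the solver is stubbed so nothing hits the network, though with the retry budget already spent it's never called):

def test_respects_max_retries(middleware, monkeypatch):
    monkeypatch.setattr(middleware, "_solve_captcha", lambda response: "token")
    request = Request("https://example.com", meta={"captcha_retries": 3})
    body = '<div class="g-recaptcha" data-sitekey="abc123"></div>'
    response = HtmlResponse(
        url="https://example.com",
        request=request,
        body=body.encode(),
    )
    result = middleware.process_response(request, response, None)
    assert result is response  # gave up and passed the page through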
When NOT to Use This Approach
This middleware is great for:
- Large crawls where CAPTCHAs appear intermittently
- Multi-spider projects where you want consistent handling
- Production crawlers that need to be resilient
Consider alternatives when:
- Every single request gets a CAPTCHA (your IP is probably blocked)
- You need to solve CAPTCHAs on JavaScript-rendered pages (use Playwright instead)
- The site uses invisible CAPTCHAs that don't appear in the HTML source
Wrapping Up
The key insight: CAPTCHA handling is infrastructure, not business logic. Push it into a middleware and your spiders stay focused on data extraction.
Full SDK with support for all CAPTCHA types: passxapi-python on GitHub
Do you use middlewares for cross-cutting concerns in Scrapy? What other middleware patterns have worked for you?