You built a scraper that handles CAPTCHAs. It works. But how do you test it without burning money on solves every time you run your test suite?
This is a real problem. You need tests that are fast, cheap, and still catch real bugs. Let's build a testing strategy that covers all the layers — from unit tests with mocks to integration tests with real solves.
The Testing Pyramid for Scrapers
      /\
     /  \          E2E (real sites + real solves)
    /----\
   /      \        Integration (real API, test sitekeys)
  /--------\
 /          \      Unit (mocked everything)
/____________\
Most of your tests should be unit tests. A few integration tests. Rare E2E tests (maybe in CI nightly).
Layer 1: Unit Tests with Mocked CAPTCHA Responses
Test your scraper logic without any network calls:
# tests/test_scraper_unit.py
import pytest
from unittest.mock import AsyncMock, patch
from scraper import PageScraper
from captcha import CaptchaToken
@pytest.fixture
def mock_solver():
    """Mock CAPTCHA solver that returns instantly.

    ``solve`` is an awaitable mock that resolves to a fixed ``CaptchaToken``,
    so tests using this fixture never make a network call.
    """
    solver = AsyncMock()
    solver.solve.return_value = CaptchaToken(
        value="mock-token-abc123",
        ttl=120,
    )
    return solver
@pytest.fixture
def scraper(mock_solver):
    """Build a ``PageScraper`` wired to the mocked CAPTCHA solver."""
    return PageScraper(captcha_solver=mock_solver)
class TestCaptchaDetection:
    """Test that we correctly identify CAPTCHAs in HTML."""

    def test_detects_recaptcha_v2(self, scraper):
        """A ``g-recaptcha`` widget is classified and its sitekey extracted."""
        html = """
        <div class="g-recaptcha"
             data-sitekey="6Le-wvkSAAAAAPBM...">
        </div>
        """
        result = scraper.detect_captcha(html)
        assert result["type"] == "recaptcha_v2"
        assert result["sitekey"] == "6Le-wvkSAAAAAPBM..."

    def test_detects_hcaptcha(self, scraper):
        """An ``h-captcha`` widget is classified and its sitekey extracted."""
        html = """
        <div class="h-captcha"
             data-sitekey="abc-123-def">
        </div>
        """
        result = scraper.detect_captcha(html)
        assert result["type"] == "hcaptcha"
        # Check the sitekey too, so a broken extractor cannot pass on type alone.
        assert result["sitekey"] == "abc-123-def"

    def test_detects_turnstile(self, scraper):
        """A ``cf-turnstile`` widget is classified and its sitekey extracted."""
        html = """
        <div class="cf-turnstile"
             data-sitekey="0x4AAAAAAA...">
        </div>
        """
        result = scraper.detect_captcha(html)
        assert result["type"] == "turnstile"
        assert result["sitekey"] == "0x4AAAAAAA..."

    def test_no_captcha_returns_none(self, scraper):
        """A page without any CAPTCHA widget yields ``None``."""
        html = "<html><body><form>No captcha here</form></body></html>"
        assert scraper.detect_captcha(html) is None
# Minimal page containing a reCAPTCHA v2 widget; the sitekey matches the one
# asserted in test_injects_token_into_form.
SAMPLE_HTML_WITH_CAPTCHA = """
<html><body>
  <form>
    <div class="g-recaptcha"
         data-sitekey="6Le-wvkSAAAAAPBM..."></div>
  </form>
</body></html>
"""


class TestTokenInjection:
    """Test that tokens get injected correctly."""

    @pytest.mark.asyncio
    async def test_injects_token_into_form(self, scraper, mock_solver):
        """Scraping a CAPTCHA page calls the solver once with the detected params."""
        # Mock the page response so no HTTP request is made.
        with patch.object(
            scraper,
            "fetch_page",
            return_value=SAMPLE_HTML_WITH_CAPTCHA,
        ):
            await scraper.scrape("https://example.com")

        # Verify solver was called with correct params.
        # NOTE(review): only the solver call is checked; the value returned by
        # scrape() is not inspected, so the injection itself is unverified.
        mock_solver.solve.assert_called_once_with(
            captcha_type="recaptcha_v2",
            sitekey="6Le-wvkSAAAAAPBM...",
            url="https://example.com",
        )

    @pytest.mark.asyncio
    async def test_skips_solve_when_no_captcha(self, scraper, mock_solver):
        """A page without a CAPTCHA must not trigger (or pay for) a solve."""
        with patch.object(
            scraper,
            "fetch_page",
            return_value="<html>No captcha</html>",
        ):
            await scraper.scrape("https://example.com")

        mock_solver.solve.assert_not_called()
Layer 2: Testing Error Recovery
Your retry logic is critical. Test every failure mode:
# tests/test_error_handling.py
import pytest
from unittest.mock import AsyncMock
from captcha import (
    AuthenticationError,
    CaptchaSolver,
    CircuitOpenError,
    InvalidTypeError,
    TimeoutError,
    TokenExpiredError,
)
class TestRetryBehavior:
    """Retry policy of ``CaptchaSolver.solve`` with ``_do_solve`` mocked out."""

    @pytest.mark.asyncio
    async def test_retries_on_timeout(self):
        """Two timeouts followed by a success yield the token on the 3rd attempt."""
        # Set the retry budget explicitly so the test does not rely on the default.
        solver = CaptchaSolver(max_retries=3)
        solver._do_solve = AsyncMock(
            side_effect=[
                TimeoutError("Timed out"),
                TimeoutError("Timed out"),
                "valid-token-123",  # Succeeds on 3rd try
            ]
        )

        token = await solver.solve(
            "recaptcha_v2", "sitekey", "https://example.com"
        )

        assert token.value == "valid-token-123"
        assert solver._do_solve.call_count == 3

    @pytest.mark.asyncio
    async def test_fails_after_max_retries(self):
        """Once the retry budget is spent, the last ``TimeoutError`` propagates."""
        solver = CaptchaSolver(max_retries=3)
        solver._do_solve = AsyncMock(
            side_effect=TimeoutError("Always fails")
        )

        with pytest.raises(TimeoutError):
            await solver.solve(
                "recaptcha_v2", "sitekey", "https://example.com"
            )

        assert solver._do_solve.call_count == 3

    @pytest.mark.asyncio
    async def test_no_retry_on_invalid_api_key(self):
        """Fatal errors should not be retried."""
        solver = CaptchaSolver()
        solver._do_solve = AsyncMock(
            side_effect=AuthenticationError("Invalid API key")
        )

        with pytest.raises(AuthenticationError):
            await solver.solve(
                "recaptcha_v2", "sitekey", "https://example.com"
            )

        # Should fail immediately, no retries
        assert solver._do_solve.call_count == 1
class TestCircuitBreaker:
    """Circuit-breaker behaviour of ``CaptchaSolver`` under repeated failures."""

    @pytest.mark.asyncio
    async def test_opens_after_threshold(self):
        """After ``circuit_failure_threshold`` failed solves, calls fail fast."""
        solver = CaptchaSolver(circuit_failure_threshold=3)
        solver._do_solve = AsyncMock(side_effect=TimeoutError("Down"))

        # Fail 3 times to trip the circuit
        for _ in range(3):
            with pytest.raises(TimeoutError):
                await solver.solve(
                    "recaptcha_v2", "sk", "https://ex.com"
                )

        # 4th call should fail immediately (circuit open)
        with pytest.raises(CircuitOpenError):
            await solver.solve(
                "recaptcha_v2", "sk", "https://ex.com"
            )
Layer 3: Testing Token Expiry Logic
Tokens expire. Your tests should verify you handle that:
# tests/test_token_freshness.py
import pytest
import time
from unittest.mock import patch
from captcha import CaptchaToken
class TestTokenExpiry:
    """Freshness checks on ``CaptchaToken`` (``is_valid`` / ``remaining_seconds``)."""

    def test_fresh_token_is_valid(self):
        """A just-created token is valid with almost its full TTL left."""
        token = CaptchaToken(value="abc", ttl=120)
        assert token.is_valid is True
        assert token.remaining_seconds > 119

    def test_expired_token_is_invalid(self):
        """Past its TTL a token is invalid and reports zero seconds left."""
        token = CaptchaToken(value="abc", ttl=120)
        # Capture "now" before patching so the offset is computed from real time.
        expired_at = time.time() + 121

        # Fast-forward time.
        # NOTE(review): this only takes effect if CaptchaToken reads the clock
        # via ``time.time()`` at call time — confirm against the implementation.
        with patch("time.time", return_value=expired_at):
            assert token.is_valid is False
            assert token.remaining_seconds == 0

    def test_almost_expired_token(self):
        """Token with < 10s left should be treated as expired for safety."""
        token = CaptchaToken(value="abc", ttl=120)
        nearly_expired_at = time.time() + 115

        with patch("time.time", return_value=nearly_expired_at):
            assert token.remaining_seconds < 10
            # Your code should re-solve, not submit
Layer 4: Integration Tests with Test Sitekeys
Google and hCaptcha provide test sitekeys that always pass. Use them for integration tests:
# tests/test_integration.py
import pytest
import os
# Skip the whole module when PASSXAPI_KEY is unset or empty
# (in CI, provide it as a secret).
pytestmark = pytest.mark.skipif(
    os.getenv("PASSXAPI_KEY") in (None, ""),
    reason="No API key for integration tests",
)

# Official test sitekeys (always solvable)
TEST_SITEKEYS = dict(
    recaptcha_v2="6LeIxAcTAAAAAJcZVRqyHh71UMIEGNQ_MXjiZKhI",
    hcaptcha="10000000-ffff-ffff-ffff-000000000001",
)
class TestRealSolves:
    """Integration tests that hit the real API.

    Run sparingly — these cost money.
    """

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_solve_recaptcha_v2_test_key(self):
        """Submit a reCAPTCHA v2 task for the test sitekey and poll for its token."""
        # Local imports: only this slow test uses them.
        import asyncio

        import httpx

        # NOTE(review): PASSXAPI_KEY gates this module but is never sent with
        # the requests below — confirm how the API expects the key to be passed.
        async with httpx.AsyncClient(
            base_url="https://www.passxapi.com"
        ) as client:
            resp = await client.post(
                "/api/v1/task",
                json={
                    "type": "recaptcha_v2",
                    "sitekey": TEST_SITEKEYS["recaptcha_v2"],
                    "pageurl": "https://example.com",
                },
            )
            # Surface HTTP errors directly instead of failing later on the body.
            resp.raise_for_status()
            task = resp.json()
            assert "task_id" in task

            # Poll for result: up to 30 attempts, 2 seconds apart.
            for _ in range(30):
                result = await client.get(
                    f"/api/v1/task/{task['task_id']}"
                )
                result.raise_for_status()
                data = result.json()
                if data["status"] == "completed":
                    assert len(data["token"]) > 50
                    return
                await asyncio.sleep(2)

        pytest.fail("Solve timed out")

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_solve_hcaptcha_test_key(self):
        """Placeholder for the equivalent hCaptcha test."""
        # Similar pattern with hCaptcha test key
        ...
Layer 5: Fixture Factory Pattern
Create reusable fixtures for common CAPTCHA scenarios:
# tests/conftest.py
import pytest
from unittest.mock import AsyncMock
@pytest.fixture
def captcha_html_factory():
    """Generate HTML pages with different CAPTCHAs.

    Returns a callable ``_make(captcha_type, sitekey)`` that renders a minimal
    HTML document containing the requested widget.
    """

    def _make(
        captcha_type: str = "recaptcha_v2",
        sitekey: str = "test-key-123",
    ) -> str:
        """Render a page for ``captcha_type``.

        Any type not listed in ``templates`` falls back to the ``"none"``
        template, i.e. a plain form with no CAPTCHA widget.
        """
        templates = {
            "recaptcha_v2": f"""
            <div class="g-recaptcha"
                 data-sitekey="{sitekey}"></div>
            """,
            "hcaptcha": f"""
            <div class="h-captcha"
                 data-sitekey="{sitekey}"></div>
            """,
            "turnstile": f"""
            <div class="cf-turnstile"
                 data-sitekey="{sitekey}"></div>
            """,
            "none": "<form><input type='submit'></form>",
        }
        body = templates.get(captcha_type, templates["none"])
        return f"<html><body>{body}</body></html>"

    return _make
@pytest.fixture
def slow_solver():
    """Solver that simulates network delay.

    Both ``await solver(...)`` and ``await solver.solve(...)`` sleep briefly and
    then return ``"delayed-token-xyz"``.
    """
    import asyncio

    async def _slow_solve(*args, **kwargs):
        await asyncio.sleep(0.1)  # Simulate API latency
        return "delayed-token-xyz"

    solver = AsyncMock(side_effect=_slow_solve)
    # side_effect on the parent mock does not apply to child attributes.
    # Without this line ``solver.solve(...)`` would return a bare mock at once,
    # skipping both the delay and the token.
    solver.solve = AsyncMock(side_effect=_slow_solve)
    return solver
@pytest.fixture
def flaky_solver():
    """Solver that fails on every second call (a deterministic 50% failure rate).

    Calls 1, 3, 5, ... return ``"token-attempt-<n>"``; calls 2, 4, 6, ... raise
    ``TimeoutError``. ``solver(...)`` and ``solver.solve(...)`` share one counter.
    """
    call_count = 0

    async def _flaky(*args, **kwargs):
        nonlocal call_count
        call_count += 1
        if call_count % 2 == 0:
            # NOTE(review): this is the builtin TimeoutError unless the module
            # imports the solver library's own TimeoutError — confirm which one
            # the retry logic under test actually catches.
            raise TimeoutError("Randomly failed")
        return f"token-attempt-{call_count}"

    solver = AsyncMock(side_effect=_flaky)
    # side_effect on the parent mock does not apply to ``solver.solve``,
    # so wire the method up explicitly.
    solver.solve = AsyncMock(side_effect=_flaky)
    return solver
Running Your Test Suite
Set up pytest.ini to separate fast and slow tests:
# pytest.ini
[pytest]
# Continuation lines of a multi-line value must be indented.
markers =
    slow: integration tests that hit real APIs
# Default: skip slow tests
addopts = -m "not slow" --tb=short -q
# Fast unit tests (daily, in pre-commit)
pytest
# Full suite with integration tests (nightly CI)
PASSXAPI_KEY=your_key pytest -m "" --tb=long
# Just the CAPTCHA detection tests
pytest tests/test_scraper_unit.py::TestCaptchaDetection -v
Key Takeaways
- Unit test your detection logic — parse HTML, find sitekeys, classify types
- Mock the solver for speed — don't hit APIs in unit tests
- Test every failure mode — timeout, wrong type, expired tokens, circuit breaker
- Use test sitekeys for integration tests — they're always solvable, but each solve through the real API can still cost money, so run them sparingly
- Separate fast/slow tests with pytest markers
- Fixture factories make it easy to generate test scenarios
For the CAPTCHA-solving client used in these examples, check out passxapi-python — it provides a clean async interface that's easy to mock in tests.
How do you test your scrapers? Share your approach in the comments.
Top comments (0)