DEV Community

Alex Chen
Alex Chen

Posted on

Testing Your Scraper: How to Write pytest Tests for Code That Hits CAPTCHAs

You built a scraper that handles CAPTCHAs. It works. But how do you test it without burning money on solves every time you run your test suite?

This is a real problem. You need tests that are fast, cheap, and still catch real bugs. Let's build a testing strategy that covers all the layers — from unit tests with mocks to integration tests with real solves.

The Testing Pyramid for Scrapers

        /\
       /  \    E2E (real sites + real solves)
      /----\   
     /      \  Integration (real API, test sitekeys)
    /--------\
   /          \ Unit (mocked everything)
  /____________\
Enter fullscreen mode Exit fullscreen mode

Most of your tests should be unit tests, backed by a few integration tests and only rare E2E tests (for example, in a nightly CI run).

Layer 1: Unit Tests with Mocked CAPTCHA Responses

Test your scraper logic without any network calls:

# tests/test_scraper_unit.py
import pytest
from unittest.mock import AsyncMock, patch

from scraper import PageScraper
from captcha import CaptchaToken

@pytest.fixture
def mock_solver():
    """Mock CAPTCHA solver that returns instantly.

    Returns:
        An ``AsyncMock`` whose awaited ``solve(...)`` yields a
        ``CaptchaToken`` with value ``"mock-token-abc123"`` and ``ttl=120``,
        so tests that use it never call a real solving service.
    """
    solver = AsyncMock()
    solver.solve.return_value = CaptchaToken(
        value="mock-token-abc123",
        ttl=120,
    )
    return solver

@pytest.fixture
def scraper(mock_solver):
    """Build a ``PageScraper`` that uses the mocked solver fixture."""
    return PageScraper(captcha_solver=mock_solver)


class TestCaptchaDetection:
    """Test that we correctly identify CAPTCHAs in HTML.

    Each positive case checks both the detected type and the extracted
    sitekey, so a detector that classifies the widget but reads the wrong
    attribute is still caught.
    """

    def test_detects_recaptcha_v2(self, scraper):
        """A ``g-recaptcha`` div is reported as reCAPTCHA v2 with its sitekey."""
        html = """
        <div class="g-recaptcha" 
             data-sitekey="6Le-wvkSAAAAAPBM...">
        </div>
        """
        result = scraper.detect_captcha(html)
        assert result["type"] == "recaptcha_v2"
        assert result["sitekey"] == "6Le-wvkSAAAAAPBM..."

    def test_detects_hcaptcha(self, scraper):
        """An ``h-captcha`` div is reported as hCaptcha with its sitekey."""
        html = """
        <div class="h-captcha" 
             data-sitekey="abc-123-def">
        </div>
        """
        result = scraper.detect_captcha(html)
        assert result["type"] == "hcaptcha"
        assert result["sitekey"] == "abc-123-def"

    def test_detects_turnstile(self, scraper):
        """A ``cf-turnstile`` div is reported as Turnstile with its sitekey."""
        html = """
        <div class="cf-turnstile" 
             data-sitekey="0x4AAAAAAA...">
        </div>
        """
        result = scraper.detect_captcha(html)
        assert result["type"] == "turnstile"
        assert result["sitekey"] == "0x4AAAAAAA..."

    def test_no_captcha_returns_none(self, scraper):
        """HTML without any CAPTCHA widget yields ``None``."""
        html = "<html><body><form>No captcha here</form></body></html>"
        assert scraper.detect_captcha(html) is None


class TestTokenInjection:
    """Test that the scraper asks the solver for a token only when needed.

    ``fetch_page`` is patched on the scraper instance, so no HTTP request is
    made; the assertions inspect the calls recorded by the mocked solver.
    """

    @pytest.mark.asyncio
    async def test_injects_token_into_form(self, scraper, mock_solver):
        """A page with a reCAPTCHA v2 widget triggers exactly one solve call."""
        # NOTE(review): SAMPLE_HTML_WITH_CAPTCHA is not defined in this
        # snippet. It must contain a g-recaptcha widget whose sitekey matches
        # the one asserted below — confirm where it is declared.
        with patch.object(
            scraper, "fetch_page",
            return_value=SAMPLE_HTML_WITH_CAPTCHA
        ):
            await scraper.scrape("https://example.com")

            # Verify solver was called with correct params
            mock_solver.solve.assert_called_once_with(
                captcha_type="recaptcha_v2",
                sitekey="6Le-wvkSAAAAAPBM...",
                url="https://example.com"
            )

    @pytest.mark.asyncio
    async def test_skips_solve_when_no_captcha(
        self, scraper, mock_solver
    ):
        """A page without a CAPTCHA must not spend a solve."""
        with patch.object(
            scraper, "fetch_page",
            return_value="<html>No captcha</html>"
        ):
            await scraper.scrape("https://example.com")
            mock_solver.solve.assert_not_called()
Enter fullscreen mode Exit fullscreen mode

Layer 2: Testing Error Recovery

Your retry logic is critical. Test every failure mode:

# tests/test_error_handling.py
import pytest
from unittest.mock import AsyncMock
from captcha import (
    AuthenticationError,
    CaptchaSolver,
    CircuitOpenError,
    InvalidTypeError,
    TimeoutError,
    TokenExpiredError,
)

class TestRetryBehavior:
    """Retry policy of ``CaptchaSolver.solve``.

    Each test replaces the private ``_do_solve`` coroutine with an
    ``AsyncMock`` so the outcome of every attempt is scripted and no request
    leaves the process.
    """

    @pytest.mark.asyncio
    async def test_retries_on_timeout(self):
        """Two timeouts followed by a success yield the successful token."""
        # Pin the retry budget so the test does not depend on the default.
        solver = CaptchaSolver(max_retries=3)
        solver._do_solve = AsyncMock(
            side_effect=[
                TimeoutError("Timed out"),
                TimeoutError("Timed out"),
                "valid-token-123"  # Succeeds on 3rd try
            ]
        )

        token = await solver.solve(
            "recaptcha_v2", "sitekey", "https://example.com"
        )
        assert token.value == "valid-token-123"
        assert solver._do_solve.call_count == 3

    @pytest.mark.asyncio
    async def test_fails_after_max_retries(self):
        """When every attempt times out, the error surfaces after 3 attempts."""
        solver = CaptchaSolver(max_retries=3)
        solver._do_solve = AsyncMock(
            side_effect=TimeoutError("Always fails")
        )

        with pytest.raises(TimeoutError):
            await solver.solve(
                "recaptcha_v2", "sitekey",
                "https://example.com"
            )
        assert solver._do_solve.call_count == 3

    @pytest.mark.asyncio
    async def test_no_retry_on_invalid_api_key(self):
        """Fatal errors should not be retried."""
        solver = CaptchaSolver()
        solver._do_solve = AsyncMock(
            side_effect=AuthenticationError("Invalid API key")
        )

        with pytest.raises(AuthenticationError):
            await solver.solve(
                "recaptcha_v2", "sitekey",
                "https://example.com"
            )
        # Should fail immediately, no retries
        assert solver._do_solve.call_count == 1


class TestCircuitBreaker:
    """Circuit-breaker behaviour of ``CaptchaSolver``."""

    @pytest.mark.asyncio
    async def test_opens_after_threshold(self):
        """After three failed solves, the next call raises ``CircuitOpenError``."""
        # One attempt per solve() call, so internal retries cannot trip the
        # breaker before the loop below has finished its three failures.
        solver = CaptchaSolver(
            max_retries=1,
            circuit_failure_threshold=3
        )
        solver._do_solve = AsyncMock(
            side_effect=TimeoutError("Down")
        )

        # Fail 3 times to trip the circuit
        for _ in range(3):
            with pytest.raises(TimeoutError):
                await solver.solve(
                    "recaptcha_v2", "sk", "https://ex.com"
                )

        # 4th call should fail immediately (circuit open)
        with pytest.raises(CircuitOpenError):
            await solver.solve(
                "recaptcha_v2", "sk", "https://ex.com"
            )
Enter fullscreen mode Exit fullscreen mode

Layer 3: Testing Token Expiry Logic

Tokens expire. Your tests should verify you handle that:

# tests/test_token_freshness.py
import pytest
import time
from unittest.mock import patch
from captcha import CaptchaToken

class TestTokenExpiry:
    """Checks on ``CaptchaToken`` validity as the clock moves past its ttl."""

    def test_fresh_token_is_valid(self):
        """A token that was just created is valid with almost all of its ttl left."""
        fresh = CaptchaToken(value="abc", ttl=120)
        assert fresh.is_valid is True
        assert fresh.remaining_seconds > 119

    def test_expired_token_is_invalid(self):
        """Once the clock is past the ttl, the token is invalid with 0 seconds left."""
        stale = CaptchaToken(value="abc", ttl=120)
        # Pretend 121 seconds have elapsed since the token was created.
        after_expiry = time.time() + 121
        with patch("time.time", return_value=after_expiry):
            assert stale.is_valid is False
            assert stale.remaining_seconds == 0

    def test_almost_expired_token(self):
        """A token with under 10 seconds left should be treated as expired
        for safety."""
        nearly_stale = CaptchaToken(value="abc", ttl=120)
        # Pretend 115 seconds have elapsed, leaving about 5 seconds of ttl.
        near_expiry = time.time() + 115
        with patch("time.time", return_value=near_expiry):
            assert nearly_stale.remaining_seconds < 10
            # Calling code should re-solve here rather than submit
Enter fullscreen mode Exit fullscreen mode

Layer 4: Integration Tests with Test Sitekeys

Google and hCaptcha provide test sitekeys that always pass. Use them for integration tests:

# tests/test_integration.py
import pytest
import os

# Module-wide skip: without PASSXAPI_KEY in the environment (set it as a
# secret in CI) none of the tests in this file run.
pytestmark = pytest.mark.skipif(
    not os.environ.get("PASSXAPI_KEY"),
    reason="No API key for integration tests",
)

# Test sitekeys published by the CAPTCHA providers (always solvable),
# keyed by the CAPTCHA type name used in task requests.
TEST_SITEKEYS = dict(
    recaptcha_v2="6LeIxAcTAAAAAJcZVRqyHh71UMIEGNQ_MXjiZKhI",
    hcaptcha="10000000-ffff-ffff-ffff-000000000001",
)

class TestRealSolves:
    """Integration tests that hit the real API.
    Run sparingly — these cost money."""

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_solve_recaptcha_v2_test_key(self):
        """Create a reCAPTCHA v2 task for the test sitekey and poll until solved.

        Polls the task up to 30 times, 2 seconds apart, and fails the test if
        no ``"completed"`` status is seen in that window.
        """
        import asyncio

        import httpx

        # NOTE(review): PASSXAPI_KEY gates this module but is never sent with
        # the requests below — confirm how the API expects the key to be passed.
        async with httpx.AsyncClient(
            base_url="https://www.passxapi.com"
        ) as client:
            resp = await client.post("/api/v1/task", json={
                "type": "recaptcha_v2",
                "sitekey": TEST_SITEKEYS["recaptcha_v2"],
                "pageurl": "https://example.com"
            })
            # Surface HTTP-level failures (auth, quota, bad request) directly
            # instead of as a confusing missing-key assertion below.
            resp.raise_for_status()
            task = resp.json()
            assert "task_id" in task

            # Poll for result
            for _ in range(30):
                result = await client.get(
                    f"/api/v1/task/{task['task_id']}"
                )
                data = result.json()
                if data["status"] == "completed":
                    assert len(data["token"]) > 50
                    return
                await asyncio.sleep(2)

            pytest.fail("Solve timed out")

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_solve_hcaptcha_test_key(self):
        """Placeholder for the hCaptcha variant; currently asserts nothing."""
        # Similar pattern with hCaptcha test key
        ...
Enter fullscreen mode Exit fullscreen mode

Layer 5: Fixture Factory Pattern

Create reusable fixtures for common CAPTCHA scenarios:

# tests/conftest.py
import pytest
from unittest.mock import AsyncMock

@pytest.fixture
def captcha_html_factory():
    """Generate HTML pages with different CAPTCHAs.

    Returns:
        A callable ``_make(captcha_type="recaptcha_v2", sitekey="test-key-123")``
        that returns a full ``<html><body>...</body></html>`` string embedding
        the matching widget markup. ``"none"`` and any unrecognised
        ``captcha_type`` produce a plain form with no CAPTCHA.
    """
    def _make(
        captcha_type: str = "recaptcha_v2",
        sitekey: str = "test-key-123"
    ) -> str:
        templates = {
            "recaptcha_v2": f"""
                <div class="g-recaptcha" 
                     data-sitekey="{sitekey}"></div>
            """,
            "hcaptcha": f"""
                <div class="h-captcha" 
                     data-sitekey="{sitekey}"></div>
            """,
            "turnstile": f"""
                <div class="cf-turnstile" 
                     data-sitekey="{sitekey}"></div>
            """,
            "none": "<form><input type='submit'></form>"
        }
        # Unknown types fall back to the CAPTCHA-free page rather than raising.
        body = templates.get(captcha_type, templates["none"])
        return f"<html><body>{body}</body></html>"

    return _make


@pytest.fixture
def slow_solver():
    """Solver that simulates network delay.

    Returns:
        An ``AsyncMock`` that sleeps 0.1 s and then returns
        ``"delayed-token-xyz"``, both when awaited directly and when awaited
        through its ``solve`` attribute.
    """
    import asyncio

    async def _slow_solve(*args, **kwargs):
        await asyncio.sleep(0.1)  # Simulate API latency
        return "delayed-token-xyz"

    solver = AsyncMock(side_effect=_slow_solve)
    # Code under test calls ``solver.solve(...)``. Without this line that
    # attribute would be an auto-created child mock that returns at once,
    # so no delay would be simulated.
    solver.solve = AsyncMock(side_effect=_slow_solve)
    return solver


@pytest.fixture
def flaky_solver():
    """Solver that fails on every second call.

    The failure pattern is deterministic, not random: odd-numbered calls
    return ``"token-attempt-<n>"`` and even-numbered calls raise
    ``TimeoutError``. Direct calls and calls through ``solve`` share one
    counter.

    NOTE(review): ``TimeoutError`` here is whatever name is in scope in this
    module (the builtin unless one is imported) — confirm it is the type the
    solver's retry logic handles.
    """
    call_count = 0

    async def _flaky(*args, **kwargs):
        nonlocal call_count
        call_count += 1
        if call_count % 2 == 0:
            raise TimeoutError("Randomly failed")
        return f"token-attempt-{call_count}"

    solver = AsyncMock(side_effect=_flaky)
    # Expose the same behaviour through ``solver.solve(...)``, the interface
    # the scraper uses. Otherwise that attribute is a child mock that never fails.
    solver.solve = AsyncMock(side_effect=_flaky)
    return solver
Enter fullscreen mode Exit fullscreen mode

Running Your Test Suite

Set up pytest.ini to separate fast and slow tests:

# pytest.ini
[pytest]
markers =
    slow: integration tests that hit real APIs

# Default: skip slow tests
addopts = -m "not slow" --tb=short -q

Then run the suite in whichever mode you need:

# Fast unit tests (daily, in pre-commit)
pytest

# Full suite with integration tests (nightly CI)
PASSXAPI_KEY=your_key pytest -m "" --tb=long

# Just the CAPTCHA detection tests
pytest tests/test_scraper_unit.py::TestCaptchaDetection -v
Enter fullscreen mode Exit fullscreen mode

Key Takeaways

  1. Unit test your detection logic — parse HTML, find sitekeys, classify types
  2. Mock the solver for speed — don't hit APIs in unit tests
  3. Test every failure mode — timeout, wrong type, expired tokens, circuit breaker
  4. Use test sitekeys for integration tests — the sitekeys themselves are free and always solvable, but each solve through a paid API still costs money, so run these sparingly
  5. Separate fast/slow tests with pytest markers
  6. Fixture factories make it easy to generate test scenarios

For the CAPTCHA-solving client used in these examples, check out passxapi-python — it provides a clean async interface that's easy to mock in tests.


How do you test your scrapers? Share your approach in the comments.

Top comments (0)