You're running a scraper at scale. You've got proxy rotation to avoid IP bans, and CAPTCHA solving for when challenges pop up. But using them together has some non-obvious gotchas that can waste money and tank your success rate.
Here's what I've learned from running scrapers that process 50K+ pages daily.
The Counter-Intuitive Problem
Most developers think: "More proxy rotation = fewer CAPTCHAs." The reality is often the opposite.
Every time you rotate to a new IP, you lose your trust score with the target site. New IPs are suspicious by default. The result: you get MORE CAPTCHAs, not fewer.
# BAD: Rotate proxy on every request
# Result: CAPTCHA on nearly every request
for url in urls:
    proxy = get_random_proxy()  # fresh IP every time -> zero accumulated trust
    resp = requests.get(url, proxies={"https": proxy})
    # Each request looks like a brand-new, suspicious visitor.
    # CAPTCHA rate: ~60-80%
The Right Strategy: Sticky Sessions
Use one proxy IP for as long as possible. Only rotate when you get blocked:
import httpx
import os
class SmartScraper:
    """Scraper that keeps one "sticky" proxy (and its cookie jar) as long as
    possible, solving CAPTCHAs in place and rotating only on hard blocks.

    Fixes over the naive version:
      * a persistent httpx.Client per proxy — the original opened a fresh
        client on every scrape() call, throwing away the session cookie that
        a CAPTCHA solve had just earned;
      * bounded retries on 403 — the original recursed without limit and
        could burn the whole proxy pool (or hit RecursionError) on a
        persistently blocked URL.
    """

    def __init__(self):
        self.current_proxy = None
        self.request_count = 0
        self.max_requests_per_ip = 50
        # NOTE(review): _load_proxies / _has_captcha / _detect_type /
        # _extract_key are defined elsewhere — _load_proxies is assumed to
        # return an iterator of proxy URLs (next() is called on it).
        self.proxy_pool = self._load_proxies()
        self.solver = CaptchaSolver(api_key=os.getenv("PASSXAPI_KEY"))
        # Persistent client so cookies survive across scrape() calls on the
        # same IP; replaced only when the proxy rotates.
        self._client = None

    def _rotate_proxy(self):
        """Advance to the next proxy, discarding the old client and cookies."""
        if self._client is not None:
            self._client.close()
            self._client = None
        self.current_proxy = next(self.proxy_pool)
        self.request_count = 0
        return self.current_proxy

    def _get_proxy(self):
        """Return the sticky proxy, rotating first if the per-IP budget is spent."""
        if (self.current_proxy is None or
                self.request_count >= self.max_requests_per_ip):
            return self._rotate_proxy()
        return self.current_proxy

    def _get_client(self):
        """Return the persistent client bound to the current sticky proxy."""
        proxy = self._get_proxy()
        if self._client is None:
            self._client = httpx.Client(proxy=proxy, timeout=30)
        return self._client

    def scrape(self, url, max_retries=3):
        """Fetch *url*, solving any CAPTCHA on the SAME IP.

        max_retries bounds how many fresh IPs are tried on a hard 403 block
        before the blocked response is returned to the caller.
        """
        client = self._get_client()
        resp = client.get(url)
        self.request_count += 1
        if self._has_captcha(resp):
            # Solve CAPTCHA, DON'T rotate: the session cookie the solve earns
            # is what makes subsequent requests on this IP CAPTCHA-free.
            token = self.solver.solve(
                captcha_type=self._detect_type(resp),
                sitekey=self._extract_key(resp),
                url=url,
            )
            resp = client.get(url, params={"token": token})
        if resp.status_code == 403:
            # Only rotate on hard blocks — and give up after max_retries IPs.
            if max_retries <= 0:
                return resp
            self._rotate_proxy()
            return self.scrape(url, max_retries - 1)
        return resp
Key insight: After solving a CAPTCHA, the session cookie grants trust. Keep using that same IP + cookie combo for subsequent requests.
Session Cookie Management
The CAPTCHA token gives you a session cookie. Preserve it:
class SessionManager:
    """Maintains one long-lived httpx client per proxy plus usage stats."""

    def __init__(self):
        # proxy URL -> {"client", "requests", "captchas_solved", "created_at"}
        self.sessions = {}

    def get_session(self, proxy_url):
        """Return the session record for *proxy_url*, creating it on first use."""
        existing = self.sessions.get(proxy_url)
        if existing is not None:
            return existing
        record = {
            "client": httpx.Client(
                proxy=proxy_url,
                timeout=30,
                follow_redirects=True,
            ),
            "requests": 0,
            "captchas_solved": 0,
            "created_at": time.time(),
        }
        self.sessions[proxy_url] = record
        return record

    def mark_captcha_solved(self, proxy_url):
        """After solving, this session is now trusted."""
        record = self.sessions.get(proxy_url)
        if record is not None:
            record["captchas_solved"] += 1

    def should_rotate(self, proxy_url):
        """True if the session is unknown, has served >100 requests, or is >10 min old."""
        record = self.sessions.get(proxy_url)
        if record is None:
            return True
        too_busy = record["requests"] > 100
        too_old = time.time() - record["created_at"] > 600
        return too_busy or too_old

    def cleanup(self, proxy_url):
        """Close and forget the session for *proxy_url* (no-op if absent)."""
        record = self.sessions.pop(proxy_url, None)
        if record is not None:
            record["client"].close()
Proxy Types and CAPTCHA Rates
Not all proxies are equal when it comes to CAPTCHAs:
| Proxy Type | Cost | CAPTCHA Rate | Best For |
|---|---|---|---|
| Datacenter | \$1-5/GB | 40-70% | High volume, low value |
| Residential | \$5-15/GB | 10-25% | Most scraping |
| ISP/Static | \$2-5/IP/mo | 5-15% | Long sessions |
| Mobile | \$20-50/GB | 2-8% | Hardest targets |
The math: Residential proxies cost more per GB but trigger fewer CAPTCHAs. At \$0.001 per solve, the CAPTCHA savings alone rarely offset the bandwidth premium (see the calculation below) — but once you account for the hard blocks and retries that cheap datacenter IPs suffer, better proxies often win on total cost.
Cost Optimization Formula
def calculate_cost_per_page(proxy_cost_per_gb, captcha_rate,
                            avg_page_size_mb=0.5,
                            captcha_cost=0.001):
    """Return the true per-page cost: bandwidth plus expected CAPTCHA spend.

    proxy_cost_per_gb: proxy bandwidth price in $/GB.
    captcha_rate: fraction of pages that trigger a CAPTCHA, in [0, 1].
    avg_page_size_mb: mean page weight in MB (1 GB = 1024 MB here).
    captcha_cost: price of a single CAPTCHA solve in $.
    """
    bandwidth_cost = proxy_cost_per_gb * (avg_page_size_mb / 1024)
    expected_captcha_cost = captcha_rate * captcha_cost
    return bandwidth_cost + expected_captcha_cost
# Datacenter: $2/GB bandwidth, 60% of pages hit a CAPTCHA
dc_cost = calculate_cost_per_page(2, 0.60)
# = $0.00098 + $0.0006 = $0.00158/page
# Residential: $10/GB bandwidth, 15% CAPTCHA rate
res_cost = calculate_cost_per_page(10, 0.15)
# = $0.00488 + $0.00015 = $0.00503/page
# Residential costs more per page in this simple model, but the model
# ignores hard blocks and retries, which hit datacenter IPs far more often.
Async Version with Proxy Pool
For production workloads:
import asyncio
import random
import re
from itertools import cycle

import httpx
class AsyncProxyScraper:
    """Async scraper with sticky proxies and in-place CAPTCHA solving.

    Fixes over the original:
      * sticky proxy selection — the original called ``next(self.proxy_cycle)``
        on EVERY request, contradicting the sticky-session strategy this very
        article recommends; we now keep an IP for ``max_requests_per_ip``
        requests before advancing;
      * a missing sitekey no longer reaches the solver as ``None``;
      * the sitekey regex is compiled once instead of importing ``re``
        inside the hot method.
    """

    # Compiled once; matches data-sitekey="..." in the page HTML.
    _SITEKEY_RE = re.compile(r'data-sitekey="([^"]+)"')

    def __init__(self, proxies, api_key, max_requests_per_ip=50):
        self.proxy_cycle = cycle(proxies)
        self.semaphore = asyncio.Semaphore(20)
        self.solver = AsyncCaptchaSolver(api_key=api_key)
        # Sticky-proxy state. asyncio is single-threaded, so plain counters
        # are safe without a lock.
        self.max_requests_per_ip = max_requests_per_ip
        self._current_proxy = None
        self._requests_on_ip = 0

    def _next_proxy(self):
        """Return the sticky proxy, rotating only after the per-IP budget."""
        if (self._current_proxy is None
                or self._requests_on_ip >= self.max_requests_per_ip):
            self._current_proxy = next(self.proxy_cycle)
            self._requests_on_ip = 0
        self._requests_on_ip += 1
        return self._current_proxy

    async def scrape_url(self, url):
        """Fetch one URL; if a CAPTCHA appears, solve it on the SAME proxy."""
        async with self.semaphore:
            proxy = self._next_proxy()
            async with httpx.AsyncClient(
                proxy=proxy,
                timeout=30,
                follow_redirects=True
            ) as client:
                resp = await client.get(url)
                if self._has_captcha(resp.text):
                    sitekey = self._extract_sitekey(resp.text)
                    # Guard: the original passed None straight to the solver
                    # when no sitekey could be extracted.
                    if sitekey:
                        token = await self.solver.solve(
                            captcha_type="recaptcha_v2",
                            sitekey=sitekey,
                            url=url
                        )
                        # Resubmit with token on the SAME proxy/client so the
                        # trusted session cookie is kept.
                        resp = await client.post(url, data={
                            "g-recaptcha-response": token
                        })
                return {"url": url, "status": resp.status_code,
                        "data": resp.text[:500]}

    async def scrape_all(self, urls):
        """Scrape all URLs concurrently; exceptions are returned, not raised."""
        tasks = [self.scrape_url(u) for u in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)

    def _has_captcha(self, text):
        """Cheap heuristic: does the page embed a known challenge widget?"""
        indicators = ("recaptcha", "hcaptcha", "cf-turnstile")
        lowered = text.lower()
        return any(ind in lowered for ind in indicators)

    def _extract_sitekey(self, text):
        """Return the data-sitekey attribute from the HTML, or None."""
        match = self._SITEKEY_RE.search(text)
        return match.group(1) if match else None
Monitoring and Alerting
Track your CAPTCHA rate per proxy to detect problems early:
import logging
from collections import defaultdict
class ProxyMonitor:
    """Per-proxy counters for requests, CAPTCHA challenges and hard blocks."""

    def __init__(self):
        # proxy URL -> counters; entries are created lazily by record().
        self.stats = defaultdict(lambda: {
            "requests": 0, "captchas": 0, "blocks": 0
        })

    def record(self, proxy, event_type):
        """Count one request; event_type "captcha"/"blocked" bumps that counter too."""
        entry = self.stats[proxy]
        entry["requests"] += 1
        if event_type == "captcha":
            entry["captchas"] += 1
        elif event_type == "blocked":
            entry["blocks"] += 1

    def get_captcha_rate(self, proxy):
        """Return the CAPTCHA rate in [0, 1]; 0 for unknown proxies.

        Reads with .get() — the original indexed the defaultdict, silently
        creating a phantom all-zero entry for any proxy it was queried
        about, which then polluted report() and get_bad_proxies().
        """
        s = self.stats.get(proxy)
        if not s or s["requests"] == 0:
            return 0
        return s["captchas"] / s["requests"]

    def get_bad_proxies(self, threshold=0.5):
        """Proxies whose CAPTCHA rate exceeds *threshold* (with >10 requests),
        sorted worst-first as (proxy, rate) pairs."""
        bad = []
        for proxy, stats in self.stats.items():
            rate = self.get_captcha_rate(proxy)
            if rate > threshold and stats["requests"] > 10:
                bad.append((proxy, rate))
        return sorted(bad, key=lambda x: -x[1])

    def report(self):
        """Log one summary line per tracked proxy."""
        for proxy, stats in self.stats.items():
            rate = self.get_captcha_rate(proxy)
            logging.info(
                f"Proxy {proxy[:30]}... | "
                f"Requests: {stats['requests']} | "
                f"CAPTCHA rate: {rate:.1%} | "
                f"Blocks: {stats['blocks']}"
            )
Key Takeaways
- Don't rotate on every request — sticky sessions reduce CAPTCHA rates
- Solve CAPTCHAs on the same IP — the session cookie carries trust
- Rotate only on hard blocks (403, connection refused)
- Monitor per-proxy CAPTCHA rates — burn bad proxies, keep good ones
- Calculate true cost — cheaper proxies + more CAPTCHAs can cost more overall
Resources
- CAPTCHA solving SDK: passxapi-python on GitHub
- Supports reCAPTCHA, hCaptcha, Turnstile, FunCaptcha
- Async-native for high-throughput scraping
What proxy strategy works best for your use case? Share in the comments.
Top comments (0)