Web scraping has evolved significantly. Modern websites employ sophisticated anti-bot systems and dynamic rendering that make naive scraping approaches ineffective, and legal protections that make careless scraping risky. In 2026, successful web scraping requires a thoughtful combination of technical skill, ethical awareness, and robust engineering practices.
This guide covers the current best practices for Python web scraping, from choosing the right tools to handling the challenges of modern web applications.
Choosing the Right Tool for the Job
Static Content: requests + BeautifulSoup
For sites that serve pre-rendered HTML, the classic combination remains the most efficient:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
class BasicScraper:
    """Fetch and parse pre-rendered HTML pages over one shared HTTP session.

    The session is created once so that connection reuse and the default
    headers set in ``__init__`` apply to every request.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def fetch(self, url: str, timeout: int = 15) -> BeautifulSoup:
        """Download *url* and return its body parsed with ``html.parser``.

        Args:
            url: Address of the page to download.
            timeout: Timeout in seconds handed to the session's ``get`` call.

        Raises:
            requests.HTTPError: The server answered with an error status
                (raised by ``raise_for_status``).
        """
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def extract_links(self, soup: BeautifulSoup, base_url: str) -> list:
        """Return every ``<a href>`` in *soup* as an absolute URL plus its text.

        Args:
            soup: Parsed document to scan for anchors.
            base_url: URL the document was fetched from; relative hrefs are
                resolved against it.

        Returns:
            A list of ``{'url': ..., 'text': ...}`` dicts, one per anchor, in
            document order. The text is stripped and cut to 100 characters.
        """
        # Local import keeps the block self-contained, matching the file's
        # habit of importing helpers inside the methods that use them.
        from urllib.parse import urljoin

        # urljoin implements standard URL resolution, so root-relative
        # ("/a"), document-relative ("a.html"), protocol-relative
        # ("//cdn.example.com/a") and already-absolute hrefs are all handled;
        # plain string concatenation only got the first form right.
        return [
            {
                'url': urljoin(base_url, a_tag['href']),
                'text': a_tag.get_text(strip=True)[:100],
            }
            for a_tag in soup.find_all('a', href=True)
        ]
Dynamic Content: Playwright
For JavaScript-rendered pages, Playwright has become the go-to tool in 2026:
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
class DynamicScraper:
    """Load JavaScript-rendered pages in Chromium and hand back their HTML."""

    def __init__(self, headless: bool = True):
        self.headless = headless

    def fetch_rendered(self, url: str, wait_for: str = 'networkidle') -> str:
        """Open *url* in a fresh browser, scroll to the bottom, return the DOM.

        Args:
            url: Page to load.
            wait_for: Value passed to ``page.goto`` as ``wait_until``.

        Returns:
            The page's serialized HTML. When a Playwright timeout occurs the
            markup available at that moment is returned instead of raising.
        """
        desktop_viewport = {'width': 1920, 'height': 1080}
        desktop_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36'
        )
        # One jump to the end of the document to trigger lazy loading.
        scroll_to_bottom = (
            "\n"
            "() => {\n"
            "window.scrollTo(0, document.body.scrollHeight);\n"
            "}\n"
        )

        with sync_playwright() as pw:
            chromium = pw.chromium.launch(headless=self.headless)
            tab = chromium.new_context(
                viewport=desktop_viewport,
                user_agent=desktop_agent,
            ).new_page()
            try:
                tab.goto(url, wait_until=wait_for, timeout=30000)
                tab.evaluate(scroll_to_bottom)
                # Give the content requested by the scroll a moment to arrive.
                tab.wait_for_timeout(1000)
                return tab.content()
            except PlaywrightTimeout:
                # Best effort: a partially loaded page is still returned.
                return tab.content()
            finally:
                chromium.close()
APIs First: Check for Official APIs
Before scraping, always check whether the site offers an API. Many sites provide public APIs, or expose their data through the endpoints their mobile apps use:
import requests
class APIFirstScraper:
    """Try API endpoints before falling back to HTML scraping."""

    def __init__(self):
        self.session = requests.Session()

    def try_api(self, base_url: str, endpoint: str) -> dict | list | None:
        """Attempt to fetch data from a known API pattern.

        Three URL layouts are probed in order (``/api/v1/``, ``/api/`` and the
        WordPress ``/wp-json/wp/v2/`` prefix); the first one that answers with
        HTTP 200 and a JSON body wins.

        Args:
            base_url: Site root, with or without a trailing slash.
            endpoint: Resource path appended to each pattern, with or without
                a leading slash.

        Returns:
            The decoded JSON payload of the first successful candidate, or
            ``None`` when no candidate produced one.
        """
        # Normalize the separators so "https://site/" + "/posts" does not
        # produce "https://site//api/v1//posts".
        root = base_url.rstrip('/')
        path = endpoint.lstrip('/')
        api_patterns = [
            f"{root}/api/v1/{path}",
            f"{root}/api/{path}",
            f"{root}/wp-json/wp/v2/{path}",
        ]
        for api_url in api_patterns:
            try:
                response = self.session.get(api_url, timeout=5)
                if response.status_code == 200:
                    return response.json()
            except (requests.RequestException, ValueError):
                # ValueError covers a 200 response whose body is not JSON
                # (e.g. an HTML landing page); older requests releases raise
                # it without wrapping it in RequestException. Either way the
                # next pattern is tried.
                continue
        return None
Robust Request Handling
Retry with Exponential Backoff
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
import random
def create_resilient_session(
    max_retries: int = 3,
    backoff_factor: float = 1.0
) -> requests.Session:
    """Build a session whose HTTP and HTTPS requests retry automatically.

    Args:
        max_retries: Passed to ``Retry`` as ``total``.
        backoff_factor: Passed to ``Retry`` as ``backoff_factor``.

    Returns:
        A new ``requests.Session`` with one shared retrying adapter mounted
        for both URL schemes. Only GET and HEAD requests are retried, and only
        for the statuses 429, 500, 502, 503 and 504.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=max_retries,
            backoff_factor=backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
        ),
        pool_connections=10,
        pool_maxsize=100,
    )

    resilient = requests.Session()
    for scheme in ("https://", "http://"):
        resilient.mount(scheme, retrying_adapter)
    return resilient
Rate Limiting
import time
from threading import Lock
class RateLimiter:
    """Space out calls so that ``wait`` returns at most N times per minute.

    The limiter is shared by calling ``wait()`` before each request. The lock
    is held while sleeping, so concurrent callers are released one at a time.
    """

    def __init__(self, requests_per_minute: int = 30):
        """Configure the minimum gap between two consecutive requests.

        Args:
            requests_per_minute: Upper bound on the request rate.

        Raises:
            ValueError: If *requests_per_minute* is zero or negative.
        """
        if requests_per_minute <= 0:
            raise ValueError(
                "requests_per_minute must be positive, "
                f"got {requests_per_minute!r}"
            )
        self.min_interval = 60.0 / requests_per_minute
        # The monotonic clock has no defined reference point, so 0 is not a
        # safe "never" marker; -inf guarantees the first call never sleeps.
        self.last_request_time = float('-inf')
        self.lock = Lock()

    def wait(self):
        """Block until at least ``min_interval`` has passed since the last call."""
        with self.lock:
            elapsed = time.monotonic() - self.last_request_time
            if elapsed < self.min_interval:
                sleep_time = self.min_interval - elapsed
                # Add jitter (up to 30% extra) to appear more human-like.
                sleep_time += random.uniform(0, sleep_time * 0.3)
                time.sleep(sleep_time)
            # Stamp after sleeping so the next gap is measured from release.
            self.last_request_time = time.monotonic()
Proxy Rotation
import random
class ProxyRotator:
    """Pick random proxies from a pool while skipping ones reported as failed.

    Proxies are stored as given in *proxies* (``get_proxy`` prefixes each with
    ``http://``). Failure and success reports accept either that stored form
    or the full URL that ``get_proxy`` returned.
    """

    def __init__(self, proxies: list):
        self.proxies = proxies
        self.failed = set()
        # Not read by any method here; kept so existing callers that touch
        # the attribute keep working.
        self.current_index = 0

    def _normalize(self, proxy_url: str) -> str:
        """Map a reported proxy back to the form stored in ``self.proxies``."""
        if proxy_url in self.proxies:
            return proxy_url
        # get_proxy() returns "http://<proxy>"; strip the scheme so the report
        # matches the pool entry it came from.
        _scheme, separator, remainder = proxy_url.partition('://')
        return remainder if separator else proxy_url

    def get_proxy(self) -> dict:
        """Return a ``requests``-style proxies mapping for a random live proxy.

        When every proxy has been reported as failed, the failure list is
        reset and the whole pool becomes eligible again.

        Raises:
            IndexError: If the rotator was created with an empty pool.
        """
        if not self.proxies:
            raise IndexError('ProxyRotator has no proxies configured')
        available = [p for p in self.proxies if p not in self.failed]
        if not available:
            self.failed.clear()
            available = self.proxies
        proxy = random.choice(available)
        return {
            'http': f'http://{proxy}',
            'https': f'http://{proxy}'
        }

    def report_failure(self, proxy_url: str):
        """Exclude *proxy_url* from selection until it succeeds or the pool resets."""
        self.failed.add(self._normalize(proxy_url))

    def report_success(self, proxy_url: str):
        """Make *proxy_url* eligible for selection again."""
        self.failed.discard(self._normalize(proxy_url))
Data Extraction Best Practices
Prefer Structured Data and CSS Selectors Over Regex
class RobustExtractor:
"""Extract data using multiple strategies for resilience."""
def extract_price(self, soup: BeautifulSoup) -> float | None:
# Strategy 1: Structured data (JSON-LD)
json_ld = soup.find('script', type='application/ld+json')
if json_ld:
try:
import json
data = json.loads(json_ld.string)
if 'offers' in data and 'price' in data['offers']:
return float(data['offers']['price'])
except (json.JSONDecodeError, KeyError, TypeError):
pass
# Strategy 2: Common CSS selectors
selectors = [
'[itemprop="price"]',
'.price-current',
'[data-price]',
'.product-price .value',
]
for selector in selectors:
element = soup.select_one(selector)
if element:
text = element.get('content') or element.get_text(strip=True)
return self._parse_price(text)
# Strategy 3: Regex fallback
import re
match = re.search(r'[\$€£]\s*([\d,]+\.?\d*)', soup.get_text())
if match:
return float(match.group(1).replace(',', ''))
return None
def _parse_price(self, text: str) -> float | None:
import re
cleaned = re.sub(r'[^\d.,]', '', text)
if not cleaned:
return None
if ',' in cleaned and '.' in cleaned:
cleaned = cleaned.replace(',', '')
elif ',' in cleaned:
cleaned = cleaned.replace(',', '.')
try:
return float(cleaned)
except ValueError:
return None
Handling Anti-Bot Measures
Browser Fingerprint Evasion
from playwright.sync_api import sync_playwright
def create_stealth_context(
    playwright,
    user_data_dir: str = '/tmp/stealth_profile',
    headless: bool = True,
):
    """Create a browser context that appears more human.

    Args:
        playwright: A started Playwright instance (the object yielded by
            ``sync_playwright()``).
        user_data_dir: Directory holding the persistent browser profile. The
            default is a fixed shared path; pass a distinct directory per
            scraper when several run at once.
        headless: Whether to run Chromium without a visible window.

    Returns:
        The persistent browser context; the caller is responsible for
        closing it.
    """
    context = playwright.chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        headless=headless,
        args=[
            '--disable-blink-features=AutomationControlled',
            # NOTE(review): --no-sandbox turns off Chromium's sandbox; confirm
            # it is really needed in the target environment.
            '--no-sandbox',
        ]
    )
    # Remove webdriver flag: the script runs before any page script, so
    # sites reading navigator.webdriver see ``undefined``.
    hide_webdriver_js = (
        "\n"
        "Object.defineProperty(navigator, 'webdriver', {\n"
        "get: () => undefined\n"
        "});\n"
    )
    context.add_init_script(hide_webdriver_js)
    return context
Cookie and Session Management
class SessionManager:
    """Hold cookie dictionaries keyed by domain and persist them as JSON."""

    def __init__(self):
        self.cookies = {}

    def save_cookies(self, response: requests.Response, domain: str):
        """Remember the cookies carried by *response* under *domain*."""
        jar = response.cookies
        self.cookies[domain] = jar.get_dict()

    def load_cookies(self, session: requests.Session, domain: str):
        """Copy the cookies stored for *domain*, if any, into *session*."""
        if domain not in self.cookies:
            return
        session.cookies.update(self.cookies[domain])

    def save_to_file(self, filepath: str):
        """Write every stored cookie dictionary to *filepath* as JSON."""
        import json

        with open(filepath, 'w') as sink:
            json.dump(self.cookies, sink)

    def load_from_file(self, filepath: str):
        """Replace the stored cookies with the JSON content of *filepath*.

        A missing file is ignored and leaves the current cookies untouched.
        """
        import json

        try:
            source = open(filepath)
        except FileNotFoundError:
            return
        with source:
            self.cookies = json.load(source)
For the complete guide with all code examples and advanced patterns, read the full article on our blog.
Originally published at WD Tech Blog. Follow for more Python tutorials, AI tools, and developer resources.
Top comments (0)