Python web scraping has a reputation problem. Every tutorial shows you the 10-line BeautifulSoup example that works great... until you try it on a real site.
Then you hit:
- 403 Forbidden
- Empty responses (JavaScript-rendered content)
- Rate limiting after 50 requests
- CAPTCHAs
- IP bans
I've built scrapers professionally for years. Here's what actually works.
The Stack
For most scraping projects you need exactly two things:
pip install requests beautifulsoup4 lxml playwright
playwright install chromium
requests + beautifulsoup4 for static HTML. playwright for JavaScript-heavy sites. That's it.
Part 1: The Right Way to Make Requests
Most beginners do this:
import requests
response = requests.get('https://example.com/products')
Real sites will block you within minutes. Here's what you actually need:
import requests
import time
import random
# Browser-like request headers. The default python-requests User-Agent is an
# instant flag for anti-bot systems, so mimic a real Chrome installation.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # Fixed: real Chrome sends application/xml;q=0.9 here, not application/xhtml.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
def get_page(url, session, retries=3):
    """Fetch *url* through *session* with polite delays and retries.

    Retries on 429 (exponential backoff), 403/503 (random backoff) and —
    new — transient network failures (timeouts, connection resets).
    Returns the requests.Response on success, or None after *retries*
    failed attempts. Other HTTP errors are re-raised.
    """
    for attempt in range(retries):
        try:
            # Random delay — looks human, avoids rate limits
            time.sleep(random.uniform(1.5, 4.0))
            response = session.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                # Rate limited — back off exponentially
                wait = (2 ** attempt) * 10
                print(f"Rate limited. Waiting {wait}s...")
                time.sleep(wait)
            elif e.response.status_code in (403, 503):
                print(f"Blocked on attempt {attempt + 1}")
                time.sleep(random.uniform(5, 15))
            else:
                raise
        except requests.exceptions.RequestException as e:
            # Timeouts and connection resets are routine on flaky networks:
            # log and retry instead of crashing the whole crawl.
            print(f"Network error on attempt {attempt + 1}: {e}")
            time.sleep(random.uniform(5, 15))
    return None
# Use a Session to maintain cookies across requests; it also reuses the
# underlying TCP connection, which is faster and looks more browser-like.
with requests.Session() as session:
    page = get_page('https://example.com/products', session)
Key points:
- Use a real browser User-Agent — the default python-requests/2.x.x is an instant flag
- Random delays — uniform distribution looks more human than fixed delays
- Exponential backoff on 429s — respect rate limits or get IP-banned
- Sessions — cookies persist, which many sites require for navigation
Part 2: Parsing HTML with BeautifulSoup
Once you have the HTML, parsing is straightforward:
from bs4 import BeautifulSoup
def parse_products(html):
    """Parse product cards out of a listing page's HTML.

    Returns a list of dicts with 'name', 'price', 'url' and 'rating'
    keys. Any missing sub-element yields None instead of crashing —
    sites change their markup, and one malformed card shouldn't kill
    the whole run.
    """
    soup = BeautifulSoup(html, 'lxml')  # lxml is faster than html.parser

    def _text(node, selector):
        # Stripped text of the first match for *selector*, or None if absent.
        el = node.select_one(selector)
        return el.get_text(strip=True) if el else None

    products = []
    # Find all product cards
    for card in soup.select('.product-card'):
        link = card.select_one('a')
        products.append({
            'name': _text(card, '.product-title'),
            'price': _text(card, '.price'),
            # .get() instead of ['href'] so an <a> without href doesn't raise
            'url': link.get('href') if link else None,
            'rating': _text(card, '.rating'),
        })
    return products
Always use .select_one() with a fallback for optional fields. Sites change their HTML. Your scraper needs to handle missing elements without crashing.
Part 3: Handling JavaScript-Rendered Sites
~40% of modern sites render their content with JavaScript. requests gets the empty skeleton. You need a real browser.
from playwright.sync_api import sync_playwright
import time
def scrape_js_site(url):
    """Render *url* in headless Chromium and return the final HTML.

    Use this for React/Vue/Angular sites where plain requests only
    receives the empty skeleton. Raises playwright's TimeoutError if
    no '.product-card' appears within 10s.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                # Use a real viewport size
                viewport={'width': 1920, 'height': 1080},
                # Pass real browser headers
                extra_http_headers={
                    'Accept-Language': 'en-US,en;q=0.9',
                },
            )
            page = context.new_page()
            # Navigate and wait for content to load
            page.goto(url)
            page.wait_for_selector('.product-card', timeout=10000)
            # Optional: scroll to trigger lazy loading
            page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(2)
            # Get the fully-rendered HTML
            return page.content()
        finally:
            # Always close, even when wait_for_selector times out —
            # otherwise every failed scrape leaks a Chromium process.
            browser.close()
When to use Playwright vs requests:
- requests: static HTML, APIs, anything that works in curl
- playwright: React/Vue/Angular apps, infinite scroll, login flows, anything that needs JS
Part 4: Pagination
Most real scraping jobs involve multiple pages. Here's the pattern:
def scrape_all_pages(base_url):
    """Collect items from every page of a paginated listing.

    Requests ?page=N URLs in ascending order and stops when a fetch
    fails, a page parses to zero items, or the rel="next" link is
    missing. Returns the combined list of parsed items.
    """
    collected = []
    with requests.Session() as session:
        page_num = 1
        while True:
            response = get_page(f"{base_url}?page={page_num}", session)
            if not response:
                break
            html = response.text
            items = parse_products(html)
            if not items:
                break  # No more results
            collected.extend(items)
            print(f"Page {page_num}: {len(items)} items (total: {len(collected)})")
            # Stop once the site no longer advertises a next page
            if BeautifulSoup(html, 'lxml').select_one('a[rel="next"]') is None:
                break
            page_num += 1
    return collected
Part 5: Storing the Data
Don't just print results. Save them properly from the start:
import csv
import json
import re
import sqlite3
from datetime import datetime
def save_to_csv(items, filename):
    """Write a list of dicts to *filename* as CSV.

    Column order comes from the first item's keys. Does nothing when
    *items* is empty — there would be no header to derive.
    """
    if not items:
        return
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=items[0].keys())
        writer.writeheader()
        writer.writerows(items)
    # Fixed: the message previously printed a literal placeholder instead
    # of interpolating the output filename.
    print(f"Saved {len(items)} items to {filename}")
def save_to_sqlite(items, db_path, table_name):
    """Append *items* (list of dicts) to *table_name* in the SQLite DB.

    Creates the table on first use from the first item's keys, plus a
    scraped_at ISO-timestamp column. SQLite can't bind identifiers as
    parameters, so the table and column names are interpolated into the
    SQL — they are validated first to block SQL injection. Never pass
    untrusted strings as names. Raises ValueError on an unsafe name.
    """
    if not items:
        return

    def _ident(name):
        # Allow only plain identifiers: letters/underscore, then word chars.
        if not re.fullmatch(r'[A-Za-z_]\w*', name):
            raise ValueError(f"Unsafe SQL identifier: {name!r}")
        return name

    table = _ident(table_name)
    keys = list(items[0].keys())
    columns = ', '.join(f"{_ident(k)} TEXT" for k in keys)
    placeholders = ', '.join('?' for _ in range(len(keys) + 1))
    stamp = datetime.now().isoformat()
    rows = [list(item.values()) + [stamp] for item in items]

    conn = sqlite3.connect(db_path)
    try:
        conn.execute(f"CREATE TABLE IF NOT EXISTS {table} ({columns}, scraped_at TEXT)")
        # executemany: one prepared statement instead of N parse/execute cycles
        conn.executemany(f"INSERT INTO {table} VALUES ({placeholders})", rows)
        conn.commit()
    finally:
        # Close even if an insert fails, so the DB file isn't left locked.
        conn.close()
SQLite is underused for scraping. It's zero-setup, handles millions of rows, and makes deduplication easy.
Common Mistakes
Mistake 1: Scraping what you should be using an API for
Check for a public API before scraping. Check robots.txt before scraping too — it tells you what the site allows.
Mistake 2: Not handling network errors
Networks are flaky. Always wrap requests in try/except and implement retry logic (like the get_page() function above).
Mistake 3: Running requests in a tight loop
No delay = IP ban within minutes. time.sleep(random.uniform(1.5, 4.0)) between requests is the minimum.
Mistake 4: Collecting everything in memory instead of storing as you go
If your scraper crashes on page 47, you lose everything. Store incrementally or checkpoint frequently.
Putting It Together
Here's a complete, production-ready scraper for a static site:
import requests
import time
import random
import csv
from bs4 import BeautifulSoup
# Browser-like headers — the default python-requests User-Agent is blocked
# almost everywhere.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # Fixed: real browsers send application/xml;q=0.9 here, not application/xhtml.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}
def polite_get(session, url, min_delay=1.5, max_delay=4.0):
    """GET *url* through *session* after a random human-looking delay.

    Returns the Response on success, or None on any request failure
    (HTTP error status, timeout, connection problem).
    """
    time.sleep(random.uniform(min_delay, max_delay))
    try:
        r = session.get(url, headers=HEADERS, timeout=15)
        r.raise_for_status()
        return r
    except requests.exceptions.RequestException as e:
        # Narrowed from `except Exception`: only network/HTTP failures are
        # expected here — a blanket except also swallows programming errors.
        print(f"Error fetching {url}: {e}")
        return None
def scrape_site(start_url, output_file='results.csv'):
    """Scrape every ?page=N listing page under *start_url* into a CSV.

    Stops when a page fails to fetch or the '.pagination .next' link
    disappears. Items missing a title or link are skipped rather than
    crashing the run; a missing description defaults to ''.
    """
    results = []
    with requests.Session() as session:
        page = 1
        while True:
            url = f"{start_url}?page={page}"
            response = polite_get(session, url)
            if not response:
                break
            soup = BeautifulSoup(response.text, 'lxml')
            # Parse your target elements
            for item in soup.select('.item'):
                title = item.select_one('h2')
                link = item.select_one('a')
                desc = item.select_one('p')
                if title is None or link is None:
                    continue  # malformed card — skip instead of raising
                results.append({
                    'title': title.get_text(strip=True),
                    'link': link.get('href', ''),
                    'description': desc.get_text(strip=True) if desc else '',
                })
            # Fixed: log every page — previously the final page (the one
            # without a next link) was never reported.
            print(f"Scraped page {page}, {len(results)} total items")
            # Check for next page
            if not soup.select_one('.pagination .next'):
                break
            page += 1
    # Save results
    if results:
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=results[0].keys())
            writer.writeheader()
            writer.writerows(results)
    print(f"Done. {len(results)} items saved to {output_file}")
# Script entry point: scrape the demo listing into results.csv.
if __name__ == '__main__':
    scrape_site('https://example.com/products')
What's Next
This guide covers the fundamentals. Real-world projects also need:
- Proxy rotation when one IP isn't enough
- CAPTCHA handling (2captcha, anti-captcha APIs)
- Distributed scraping across multiple machines
- Incremental runs that only fetch new data
- Monitoring so you know when the site changed its structure
If you're building production scrapers and want a head start, I packaged up the scripts I reuse across projects into a Web Scraping Starter Kit — proxy rotation, Playwright integration, unified storage, and anti-detection headers included.
Happy scraping. Be polite to the servers.
Top comments (0)