Python Web Scraper: Extract Data from Any Website in 50 Lines
Web scraping is one of the most practical Python skills. Here's a production-ready scraper that handles most websites.
Setup
pip install requests beautifulsoup4 lxml
Core Scraper
import requests
from bs4 import BeautifulSoup
import json
import time
def scrape_page(url: str) -> BeautifulSoup:
    """Fetch *url* and return its HTML parsed with the lxml backend.

    Sends a desktop browser User-Agent (many sites reject the default
    requests UA) and raises requests.HTTPError for non-2xx responses.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    resp = requests.get(url, headers=browser_headers, timeout=15)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'lxml')
# Example: Scrape Hacker News front page
def scrape_hackernews():
    """Scrape the Hacker News front page.

    Returns a list of {'title': str, 'url': str} dicts, one per story row.
    """
    front_page = scrape_page('https://news.ycombinator.com')
    collected = []
    for row in front_page.select('.athing'):
        link = row.select_one('.titleline > a')
        if not link:
            continue  # skip rows without a title link
        collected.append({
            'title': link.text,
            'url': link.get('href', ''),
        })
    return collected
# Demo: fetch the front page and print the first five titles, truncated.
stories = scrape_hackernews()
top_five = stories[:5]
for story in top_five:
    print(f"- {story['title'][:60]}")
Handle Pagination
def scrape_all_pages(base_url: str, max_pages: int = 10):
    """Scrape paginated listings under *base_url*.

    Requests ?page=N for N in 1..max_pages, stopping early at the first
    page that contains no '.item' elements. Returns a list of
    {'title': str, 'link': str} dicts.
    """
    results = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        soup = scrape_page(url)
        items = soup.select('.item')
        if not items:
            break  # an empty page means we've run past the last page
        for item in items:
            # BUG FIX: select_one() returns None when the element is
            # missing, so a malformed item used to raise AttributeError
            # (and a missing href raised KeyError). Skip bad items and
            # use .get('href', ''), matching scrape_hackernews.
            title_el = item.select_one('h2')
            link_el = item.select_one('a')
            if title_el is None or link_el is None:
                continue
            results.append({
                'title': title_el.text.strip(),
                'link': link_el.get('href', ''),
            })
        time.sleep(1)  # be polite: rate-limit between page fetches
    return results
Save to CSV
import csv
def save_to_csv(data: list, filename: str):
    """Write a list of uniform dicts to *filename* as UTF-8 CSV.

    Column names come from the first row's keys; a no-op (no file
    created) when *data* is empty.
    """
    if not data:
        return
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    # BUG FIX: the message previously printed the literal "(unknown)"
    # instead of interpolating the target filename.
    print(f"Saved {len(data)} rows to {filename}")
# Demo: crawl every listing page, then persist the combined rows to CSV.
results = scrape_all_pages('https://example.com/products')
save_to_csv(results, 'products.csv')
For JavaScript-Heavy Sites
from playwright.sync_api import sync_playwright
def scrape_js_site(url: str):
    """Render *url* in headless Chromium and return the final DOM.

    Waits for network idle so client-side rendering has finished,
    then hands the serialized page to BeautifulSoup.
    """
    with sync_playwright() as pw:
        browser = pw.chromium.launch()
        tab = browser.new_page()
        tab.goto(url, wait_until='networkidle')
        rendered_html = tab.content()
        browser.close()
    return BeautifulSoup(rendered_html, 'lxml')
Retry Logic
def scrape_with_retry(url: str, max_retries: int = 3):
    """Fetch *url* via scrape_page, retrying transient request failures.

    Sleeps 2**n seconds after the n-th failure (exponential backoff);
    re-raises the final requests.RequestException after *max_retries*
    attempts.
    """
    failures = 0
    while True:
        try:
            return scrape_page(url)
        except requests.RequestException as e:
            failures += 1
            if failures >= max_retries:
                raise
            print(f"Retry {failures}: {e}")
            time.sleep(2 ** (failures - 1))
Want More Scripts?
Python Automation Toolkit — 25+ scripts including web scrapers, file organizers, email automators. $9 one-time.
Top comments (0)