Web scraping is one of the most useful skills a developer can have. From price monitoring to lead generation, data collection powers countless applications. Here's how to do it right.
Two Approaches
- BeautifulSoup — for static HTML pages (fast, simple)
- Selenium — for JavaScript-rendered pages (handles dynamic content)
BeautifulSoup: Static Pages
pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
import json
def scrape_articles(url):
    """Fetch *url* and return a list of dicts (title/url/description)
    extracted from every <article> element on the page."""
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    page = requests.get(url, headers=request_headers)
    parsed = BeautifulSoup(page.content, 'lxml')

    collected = []
    for node in parsed.find_all('article'):
        heading = node.find('h2')
        anchor = node.find('a')
        summary = node.find('p')
        # Skip articles missing either a heading or a link.
        if not (heading and anchor):
            continue
        collected.append({
            "title": heading.get_text(strip=True),
            "url": anchor.get('href'),
            "description": summary.get_text(strip=True) if summary else ""
        })
    return collected
results = scrape_articles("https://news.ycombinator.com")
print(json.dumps(results[:5], indent=2))
Extracting Specific Data
def scrape_product_info(url):
    """Scrape a product page and return name, price, rating and image URLs.

    Fields whose selector matches nothing come back as None
    ("images" is an empty list instead).

    Bug fix: the original used JavaScript-style optional chaining
    (`soup.select_one(...)?.get_text(...)`), which is a SyntaxError in
    Python. Missing elements are now guarded explicitly.
    """
    soup = BeautifulSoup(requests.get(url).content, 'lxml')

    def _text(selector):
        # Return stripped text for the first match, or None if absent.
        node = soup.select_one(selector)
        return node.get_text(strip=True) if node else None

    rating_node = soup.select_one('[data-rating]')
    return {
        "name": _text('h1.product-title'),
        "price": _text('.price'),
        "rating": rating_node.get('data-rating') if rating_node else None,
        "images": [img['src'] for img in soup.select('.product-image img')]
    }
Handling Pagination
def scrape_all_pages(base_url, max_pages=10):
    """Walk ?page=1..max_pages on *base_url*, collecting the text of every
    div.item, and stop early on a non-200 response or an empty page.

    Bug fix: this snippet called time.sleep without importing `time`
    anywhere in the example; a function-scope import keeps the snippet
    runnable standalone.
    """
    import time  # needed for the politeness delay below

    all_data = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.content, 'lxml')
        items = soup.find_all('div', class_='item')
        if not items:  # No more results
            break
        for item in items:
            all_data.append(item.get_text(strip=True))
        time.sleep(1)  # Be respectful
    return all_data
Selenium: Dynamic JavaScript Pages
pip install selenium webdriver-manager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
def setup_driver(headless=True):
    """Build a Chrome WebDriver (headless by default) with flags that make
    it behave in containers/CI (no sandbox, no /dev/shm reliance)."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    # webdriver-manager downloads a matching chromedriver automatically.
    service = webdriver.ChromeService(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)
def scrape_dynamic_page(url):
    """Load a JavaScript-rendered page, wait for the '.content' element,
    scroll to trigger lazy loading, and return the text of every '.item'."""
    driver = setup_driver()
    try:
        driver.get(url)
        # Block until the main content container exists (up to 10s).
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'content'))
        )
        # Scroll to the bottom so lazy-loaded items render.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        elements = driver.find_elements(By.CSS_SELECTOR, '.item')
        return [el.text for el in elements]
    finally:
        driver.quit()  # always release the browser, even on error
Saving Scraped Data
import pandas as pd
# Save to CSV
def save_to_csv(data, filename):
    """Write a list of records to *filename* as CSV (no index column).

    Bug fix: the success message previously printed the literal text
    "(unknown)" instead of interpolating the filename.
    """
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Saved {len(data)} records to {filename}")
# Save to JSON
def save_to_json(data, filename):
    """Serialize *data* to *filename* as indented (pretty-printed) JSON."""
    serialized = json.dumps(data, indent=2)
    with open(filename, 'w') as out_file:
        out_file.write(serialized)
Rate Limiting and Ethics
import time
import random
def polite_request(url, min_delay=1, max_delay=3):
    """GET *url* after sleeping a random min_delay..max_delay seconds,
    identifying ourselves honestly — be a good web citizen."""
    delay = random.uniform(min_delay, max_delay)
    time.sleep(delay)
    bot_headers = {
        "User-Agent": "My Research Bot/1.0 (contact@example.com)"
    }
    return requests.get(url, headers=bot_headers)
Always:
- Check robots.txt before scraping
- Add delays between requests
- Respect rate limits
- Don't scrape personal data without permission
Use Cases That Pay
- Price monitoring for e-commerce
- Lead generation from directories
- Competitive analysis for businesses
- Real estate listings aggregation
- Job board scraping for recruitment
Need Automated Data Collection?
Have a scraping or automation project in mind?
- Python Automation Services - Custom scrapers from $20
- AI Content Generation - Automated data pipelines from $20
What data are you trying to collect? Ask in the comments!
Top comments (0)