Web scraping is one of the most useful skills a developer can have. From price monitoring to lead generation, data collection powers countless applications. Here's how to do it right.
Two Approaches
- BeautifulSoup — for static HTML pages (fast, simple)
- Selenium — for JavaScript-rendered pages (handles dynamic content)
BeautifulSoup: Static Pages
pip install requests beautifulsoup4 lxml
import requests
from bs4 import BeautifulSoup
import json
def scrape_articles(url):
    """Fetch *url* and return a list of dicts (title/url/description)
    extracted from every <article> element on the page."""
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    page = requests.get(url, headers=request_headers)
    parsed = BeautifulSoup(page.content, 'lxml')

    collected = []
    for node in parsed.find_all('article'):
        heading = node.find('h2')
        anchor = node.find('a')
        summary = node.find('p')
        # Skip articles missing either a heading or a link.
        if not (heading and anchor):
            continue
        collected.append({
            "title": heading.get_text(strip=True),
            "url": anchor.get('href'),
            "description": summary.get_text(strip=True) if summary else ""
        })
    return collected
results = scrape_articles("https://news.ycombinator.com")
print(json.dumps(results[:5], indent=2))
Extracting Specific Data
def scrape_product_info(url):
    """Scrape a product page and return name, price, rating and image URLs.

    Fields whose selector matches nothing come back as None
    ("images" is an empty list instead).

    Bug fix: the original used JavaScript-style optional chaining
    (`soup.select_one(...)?.get_text(...)`), which is a SyntaxError in
    Python. Missing elements are now guarded explicitly.
    """
    soup = BeautifulSoup(requests.get(url).content, 'lxml')

    def _text(selector):
        # Return stripped text for the first match, or None if absent.
        node = soup.select_one(selector)
        return node.get_text(strip=True) if node else None

    rating_node = soup.select_one('[data-rating]')
    return {
        "name": _text('h1.product-title'),
        "price": _text('.price'),
        "rating": rating_node.get('data-rating') if rating_node else None,
        "images": [img['src'] for img in soup.select('.product-image img')]
    }
Handling Pagination
def scrape_all_pages(base_url, max_pages=10):
    """Walk ?page=1..max_pages on *base_url*, collecting the text of every
    div.item, and stop early on a non-200 response or an empty page.

    Bug fix: this snippet called time.sleep without importing `time`
    anywhere in the example; a function-scope import keeps the snippet
    runnable standalone.
    """
    import time  # needed for the politeness delay below

    all_data = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.content, 'lxml')
        items = soup.find_all('div', class_='item')
        if not items:  # No more results
            break
        for item in items:
            all_data.append(item.get_text(strip=True))
        time.sleep(1)  # Be respectful
    return all_data
Selenium: Dynamic JavaScript Pages
pip install selenium webdriver-manager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
def setup_driver(headless=True):
    """Build a Chrome WebDriver (headless by default) with flags that make
    it behave in containers/CI (no sandbox, no /dev/shm reliance)."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    # webdriver-manager downloads a matching chromedriver automatically.
    service = webdriver.ChromeService(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)
def scrape_dynamic_page(url):
    """Load a JavaScript-rendered page, wait for the '.content' element,
    scroll to trigger lazy loading, and return the text of every '.item'."""
    driver = setup_driver()
    try:
        driver.get(url)
        # Block until the main content container exists (up to 10s).
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'content'))
        )
        # Scroll to the bottom so lazy-loaded items render.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        elements = driver.find_elements(By.CSS_SELECTOR, '.item')
        return [el.text for el in elements]
    finally:
        driver.quit()  # always release the browser, even on error
Saving Scraped Data
import pandas as pd
# Save to CSV
def save_to_csv(data, filename):
    """Write a list of records to *filename* as CSV (no index column).

    Bug fix: the success message previously printed the literal text
    "(unknown)" instead of interpolating the filename.
    """
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Saved {len(data)} records to {filename}")
# Save to JSON
def save_to_json(data, filename):
    """Serialize *data* to *filename* as indented (pretty-printed) JSON."""
    serialized = json.dumps(data, indent=2)
    with open(filename, 'w') as out_file:
        out_file.write(serialized)
Rate Limiting and Ethics
import time
import random
def polite_request(url, min_delay=1, max_delay=3):
    """GET *url* after sleeping a random min_delay..max_delay seconds,
    identifying ourselves honestly — be a good web citizen."""
    delay = random.uniform(min_delay, max_delay)
    time.sleep(delay)
    bot_headers = {
        "User-Agent": "My Research Bot/1.0 (contact@example.com)"
    }
    return requests.get(url, headers=bot_headers)
Always:
- Check robots.txt before scraping
- Add delays between requests
- Respect rate limits
- Don't scrape personal data without permission
Use Cases That Pay
- Price monitoring for e-commerce
- Lead generation from directories
- Competitive analysis for businesses
- Real estate listings aggregation
- Job board scraping for recruitment
Need Automated Data Collection?
Have a scraping or automation project in mind?
- Python Automation Services - Custom scrapers from $20
- AI Content Generation - Automated data pipelines from $20
What data are you trying to collect? Ask in the comments!
Top comments (0)