DEV Community

Brad
Brad

Posted on

Python Web Scraper: Extract Data from Any Website in 50 Lines

Python Web Scraper: Extract Data from Any Website in 50 Lines

Web scraping is one of the most practical Python skills. Here's a production-ready scraper that handles most websites.

Setup

pip install requests beautifulsoup4 lxml
Enter fullscreen mode Exit fullscreen mode

Core Scraper

import requests
from bs4 import BeautifulSoup
import json
import time

def scrape_page(url: str) -> BeautifulSoup:
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Sends a browser-like User-Agent (some sites reject the default
    requests UA), raises on HTTP error statuses, and parses with the
    lxml backend. Uses a 15-second timeout.
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    resp = requests.get(url, headers={'User-Agent': ua}, timeout=15)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'lxml')

# Example: Scrape Hacker News front page
# Example: Scrape Hacker News front page
def scrape_hackernews():
    """Scrape the Hacker News front page into a list of story dicts.

    Each dict carries 'title' and 'url' keys; rows whose title link
    cannot be found are skipped.
    """
    front_page = scrape_page('https://news.ycombinator.com')
    links = (row.select_one('.titleline > a')
             for row in front_page.select('.athing'))
    return [
        {'title': link.text, 'url': link.get('href', '')}
        for link in links
        if link  # skip rows without a title anchor
    ]

# Demo: print the first five front-page titles, truncated to 60 chars.
stories = scrape_hackernews()
for entry in stories[:5]:
    print(f"- {entry['title'][:60]}")
Enter fullscreen mode Exit fullscreen mode

Handle Pagination

def scrape_all_pages(base_url: str, max_pages: int = 10):
    """Scrape up to *max_pages* paginated listing pages under *base_url*.

    Requests ``{base_url}?page=N`` for N = 1..max_pages and collects a
    {'title', 'link'} dict per ``.item`` element. Stops early at the
    first page that yields no items, and sleeps 1s between pages.

    Robustness fix: the original crashed with AttributeError when an
    item had no <h2>, and with a TypeError/KeyError when it had no <a>
    with an href; such malformed items are now skipped.
    """
    results = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        soup = scrape_page(url)

        items = soup.select('.item')
        if not items:
            break  # ran past the last page

        for item in items:
            title_el = item.select_one('h2')
            link_el = item.select_one('a')
            if title_el is None or link_el is None:
                continue  # malformed item — skip instead of crashing
            results.append({
                'title': title_el.text.strip(),
                'link': link_el.get('href', ''),
            })

        time.sleep(1)  # Be polite

    return results
Enter fullscreen mode Exit fullscreen mode

Save to CSV

import csv

def save_to_csv(data: list, filename: str):
    """Write a list of dicts to *filename* as UTF-8 CSV with a header.

    Column order comes from the first dict's keys. Does nothing when
    *data* is empty (also avoids IndexError on ``data[0]``).
    """
    if not data:
        return
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    # Bug fix: the original printed the literal text "(unknown)" instead
    # of interpolating the output path.
    print(f"Saved {len(data)} rows to {filename}")

# Demo: crawl the paginated product listing and export every row to CSV.
results = scrape_all_pages('https://example.com/products')
save_to_csv(results, 'products.csv')
Enter fullscreen mode Exit fullscreen mode

For JavaScript-Heavy Sites

from playwright.sync_api import sync_playwright

def scrape_js_site(url: str):
    """Render *url* in headless Chromium and return the final DOM parsed
    with BeautifulSoup.

    For pages that build their content with JavaScript; waits until the
    network is idle before capturing the page HTML.

    Resource-leak fix: the original never closed the browser if
    ``goto``/``content`` raised; the try/finally guarantees cleanup.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url, wait_until='networkidle')
            content = page.content()
        finally:
            browser.close()
        return BeautifulSoup(content, 'lxml')
Enter fullscreen mode Exit fullscreen mode

Retry Logic

def scrape_with_retry(url: str, max_retries: int = 3):
    """Fetch and parse *url*, retrying transient request failures.

    Makes up to *max_retries* attempts with exponential backoff
    (1s, 2s, 4s, ...) between them; re-raises the last
    RequestException once the attempts are exhausted.
    """
    attempt = 0
    while True:
        try:
            return scrape_page(url)
        except requests.RequestException as exc:
            attempt += 1
            if attempt >= max_retries:
                raise  # out of attempts — surface the last error
            print(f"Retry {attempt}: {exc}")
            time.sleep(2 ** (attempt - 1))  # exponential backoff
Enter fullscreen mode Exit fullscreen mode

Want More Scripts?

Python Automation Toolkit → — 25+ scripts including web scrapers, file organizers, email automators. $9 one-time.

Top comments (0)