Python Web Scraping: Extract Data from Any Website in 2024
Web scraping is one of the most useful Python skills. Here is how to do it right.
Basic Scraping with BeautifulSoup
import requests
from bs4 import BeautifulSoup
def scrape_page(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The request carries a desktop-browser ``User-Agent`` header and a
    10-second timeout. The HTTP status code is not checked: whatever body
    the server returns is handed to the parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    browser_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    html = requests.get(url, headers=browser_headers, timeout=10).text
    return BeautifulSoup(html, "html.parser")
Extract Common Data
# Target of every hyperlink on the page (anchors without an href are skipped)
links = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Headline text, located with a CSS selector
titles = [heading.text.strip() for heading in soup.select('h2.article-title')]

# Table contents: one list of cell strings per row; rows that contain no
# <td> cells produce an empty list and are dropped
rows = [
    cells
    for cells in (
        [td.text.strip() for td in tr.find_all('td')]
        for tr in soup.select('table tr')
    )
    if cells
]
Handle JavaScript Sites with Playwright
from playwright.sync_api import sync_playwright
def scrape_js_site(url):
    """Load a JavaScript-rendered page in headless Chromium and list its products.

    Args:
        url: Address of the page to open.

    Returns:
        A list of ``{'name': <inner text>}`` dicts, one for each element
        matching the ``.product`` selector.

    Raises:
        Any error raised by Playwright while navigating or waiting for the
        ``.product-list`` selector propagates to the caller; the browser is
        closed first.
    """
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            # Wait for the product list container before querying, so the
            # lookup below does not run against a page that is still rendering.
            page.wait_for_selector('.product-list')
            products = page.query_selector_all('.product')
            return [{'name': product.inner_text()} for product in products]
        finally:
            # Close the browser even if navigation or the selector wait fails.
            browser.close()
Handle Pagination
import time
def scrape_all_pages(base_url, max_pages=10, delay=1):
    """Collect the text of every ``.item`` element across numbered pages.

    Pages are requested through ``scrape_page`` with ``page=1``, ``page=2``,
    and so on, until a page contains no ``.item`` elements or ``max_pages``
    pages have been fetched.

    Args:
        base_url: Listing URL. It may already contain a query string or a
            fragment; the ``page`` parameter is merged into the query
            instead of being appended to the raw string.
        max_pages: Maximum number of pages to request.
        delay: Seconds to pause between consecutive requests.

    Returns:
        A list of stripped text strings, one per matched element, in the
        order the pages were fetched.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(base_url)
    all_data = []
    for page_num in range(1, max_pages + 1):
        if page_num > 1:
            # Pause only between requests, never after the final one.
            time.sleep(delay)
        page_param = f"page={page_num}"
        # Keep any existing query parameters and put the fragment back last.
        query = f"{parts.query}&{page_param}" if parts.query else page_param
        soup = scrape_page(urlunsplit(parts._replace(query=query)))
        items = soup.select('.item')
        if not items:
            # An empty page is treated as the end of the listing.
            break
        all_data.extend(el.text.strip() for el in items)
    return all_data
Save to CSV
import csv
def save_to_csv(data, filename):
    """Write dict rows to *filename* as a UTF-8 CSV file with a header row.

    Columns appear in the order each key is first seen across all rows.
    A row that lacks a key gets an empty cell in that column. When *data*
    is empty the file is created (or truncated) and left empty.

    Args:
        data: Iterable of dicts, one per output row.
        filename: Path of the CSV file to create or overwrite.
    """
    rows = list(data)
    # Union of keys in first-seen order: a later row with an extra key would
    # otherwise make DictWriter raise ValueError.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        if not rows:
            # Nothing to write; avoid the IndexError that data[0] would raise.
            return
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
Best Practices
- Respect robots.txt - check before scraping
- Add delays - 1-2 seconds between requests
- Use headers - send a realistic User-Agent so requests resemble a real browser
- Handle errors - retry on timeouts, skip on 404s
Want 50+ ready-to-use Python automation scripts? Get the complete toolkit for just $9: https://lukassbrad.gumroad.com/l/ugeka
Top comments (0)