Web Scraping for Machine Learning: Building Training Datasets

Finding quality training data is one of machine learning's biggest challenges. When no suitable public dataset exists, web scraping lets you build custom datasets for text classification, image recognition, and sentiment analysis.

Planning Your Dataset

Before scraping, define your target variable (what the model will predict), the features you need to collect, the volume required for the model to generalize, and how you will keep classes balanced.
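
Writing that plan down as a small spec keeps collection focused. Here is a minimal sketch; the field names and numbers are illustrative assumptions, not prescriptions:

# Hypothetical dataset spec that guides the scraping below
DATASET_SPEC = {
    'target': 'sentiment',             # derived from star ratings
    'classes': ['positive', 'neutral', 'negative'],
    'features': ['text', 'rating'],
    'min_samples_per_class': 2000,     # assumption: enough for a baseline classifier
    'max_class_imbalance': 1.5,        # largest class at most 1.5x the smallest
}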

Scraping Text for NLP

import requests
from bs4 import BeautifulSoup
import re

class ReviewScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'MLDataBot/1.0'})

    def scrape_reviews(self, url, selectors):
        # selectors maps roles to CSS selectors, e.g.
        # {'container': '.review', 'text': '.review-body', 'rating': '.stars'}
        resp = self.session.get(url, timeout=15)
        resp.raise_for_status()  # don't parse error pages as reviews
        soup = BeautifulSoup(resp.text, 'html.parser')
        reviews = []
        for el in soup.select(selectors['container']):
            text = el.select_one(selectors['text'])
            rating = el.select_one(selectors['rating'])
            if text and rating:
                reviews.append({
                    'text': text.get_text(strip=True),
                    'rating': self._parse_rating(rating.get('class', []))
                })
        return reviews

    def _parse_rating(self, classes):
        # Extract a numeric rating from class names like 'stars-4'
        for c in classes:
            m = re.search(r'(\d)', str(c))
            if m:
                return int(m.group(1))
        return None
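
Usage looks like this; the URL and selectors are placeholders for whatever review site you target:

scraper = ReviewScraper()
reviews = scraper.scrape_reviews(
    'https://example.com/product/reviews',  # placeholder URL
    {'container': '.review', 'text': '.review-text', 'rating': '.star-rating'}
)
print(f'Collected {len(reviews)} labeled reviews')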

Scraping Images for Computer Vision

import os
import hashlib
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

class ImageBuilder:
    def __init__(self, output='dataset'):
        self.output = output
        self.session = requests.Session()

    def scrape_images(self, url, category, selector='img'):
        # One folder per category, the layout expected by loaders
        # like torchvision's ImageFolder
        cat_dir = os.path.join(self.output, category)
        os.makedirs(cat_dir, exist_ok=True)
        resp = self.session.get(url, timeout=15)
        soup = BeautifulSoup(resp.text, 'html.parser')
        for img in soup.select(selector):
            # Lazy-loaded images often keep the real URL in data-src
            src = img.get('src') or img.get('data-src')
            if src:
                self._download(urljoin(url, src), cat_dir)

    def _download(self, url, directory):
        try:
            resp = self.session.get(url, timeout=10)
            if 'image' not in resp.headers.get('content-type', ''):
                return
            # Hash the URL so re-runs overwrite instead of duplicating files
            name = hashlib.md5(url.encode()).hexdigest()
            ext = url.split('.')[-1].split('?')[0][:4]
            with open(os.path.join(directory, f'{name}.{ext}'), 'wb') as f:
                f.write(resp.content)
        except requests.RequestException:
            pass  # skip unreachable images rather than abort the crawl
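
A minimal run, with placeholder URLs and category names standing in for your own:

builder = ImageBuilder(output='dataset')
builder.scrape_images('https://example.com/gallery/cats', 'cat')  # placeholder URLs
builder.scrape_images('https://example.com/gallery/dogs', 'dog')
# Resulting layout: dataset/cat/<md5>.jpg, dataset/dog/<md5>.jpg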

Data Cleaning Pipeline

import re
import pandas as pd
from sklearn.model_selection import train_test_split

class DataCleaner:
    def clean(self, df):
        # Deduplicate on raw text before any normalization
        df = df.drop_duplicates(subset=['text'])
        # Strip leftover HTML tags, then collapse whitespace
        df['clean'] = df['text'].apply(lambda t: re.sub(r'<[^>]+>', '', t))
        df['clean'] = df['clean'].apply(lambda t: re.sub(r'\s+', ' ', t).strip())
        # Drop fragments too short to carry signal, and rows missing labels
        df = df[df['clean'].str.len() > 20].dropna(subset=['clean', 'rating']).copy()
        # Map star ratings to three sentiment classes
        df['sentiment'] = df['rating'].apply(
            lambda r: 'positive' if r >= 4 else ('negative' if r <= 2 else 'neutral'))
        return df

    def split(self, df, test_size=0.2):
        # Stratify so class proportions match across train and test
        train, test = train_test_split(
            df, test_size=test_size, stratify=df['sentiment'], random_state=42)
        train.to_csv('train.csv', index=False)
        test.to_csv('test.csv', index=False)
        return train, test
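
Putting the pieces together end to end, using the reviews collected earlier:

df = pd.DataFrame(reviews)  # output of ReviewScraper above
cleaner = DataCleaner()
train, test = cleaner.split(cleaner.clean(df))
print(train['sentiment'].value_counts())  # sanity-check class balance before training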

Scaling Your Scraping

When building large ML datasets, managed services save you from running your own proxy infrastructure: ScraperAPI handles proxy rotation and anti-bot countermeasures, ThorData provides residential proxies for region-specific content, and ScrapeOps monitors your scraping pipelines.
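
As a sketch of how such a service slots in, here is ScraperAPI's request-forwarding pattern; YOUR_API_KEY is a placeholder, and the timeout value is an assumption:

import requests

def fetch_via_scraperapi(target_url, api_key):
    # ScraperAPI fetches the target through its proxy pool;
    # api_key and url are its core query parameters
    resp = requests.get(
        'http://api.scraperapi.com',
        params={'api_key': api_key, 'url': target_url},
        timeout=60,  # proxied requests can be slow, so allow extra time
    )
    resp.raise_for_status()
    return resp.text

html = fetch_via_scraperapi('https://example.com/product/reviews', 'YOUR_API_KEY')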

Conclusion

Web scraping bridges the gap between raw web data and trained ML models. Focus on data quality over quantity, automate your cleaning pipeline, and always validate the dataset before training.
