Finding quality training data is one of machine learning's biggest challenges. Web scraping lets you build custom datasets for tasks such as text classification, image recognition, and sentiment analysis.
Planning Your Dataset
Before scraping, define your target variable, features needed, required volume, and class balance strategy.
Scraping Text for NLP
import requests
from bs4 import BeautifulSoup
import re
class ReviewScraper:
    """Scrape review text and numeric ratings from HTML pages for NLP datasets."""

    def __init__(self):
        # One shared session: connection pooling plus a consistent User-Agent.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'MLDataBot/1.0'})

    def scrape_reviews(self, url, selectors):
        """Return a list of {'text', 'rating'} dicts scraped from *url*.

        *selectors* maps 'container', 'text', and 'rating' to CSS selectors.
        Raises requests.HTTPError on non-2xx responses (fix: previously an
        error page would be silently parsed as if it contained reviews).
        """
        resp = self.session.get(url, timeout=15)
        resp.raise_for_status()  # fail loudly on 4xx/5xx instead of scraping an error page
        soup = BeautifulSoup(resp.text, 'html.parser')
        reviews = []
        for el in soup.select(selectors['container']):
            text = el.select_one(selectors['text'])
            rating = el.select_one(selectors['rating'])
            # Skip containers missing either field rather than emitting partial rows.
            if text and rating:
                reviews.append({
                    'text': text.get_text(strip=True),
                    'rating': self._parse_rating(rating.get('class', [])),
                })
        return reviews

    def _parse_rating(self, classes):
        """Return the first single digit found in any CSS class, else None.

        Sites commonly encode ratings in class names such as 'star-4'.
        NOTE(review): this matches ANY digit, so an unrelated numeric class
        (e.g. a grid class like 'col-6') could be misread as a rating —
        confirm the 'rating' selector targets the rating element precisely.
        """
        for c in classes:
            m = re.search(r'(\d)', str(c))
            if m:
                return int(m.group(1))
        return None
Scraping Images for Computer Vision
import os, hashlib
from urllib.parse import urljoin
class ImageBuilder:
    """Download images into per-category directories for a vision dataset."""

    def __init__(self, output='dataset'):
        self.output = output
        self.session = requests.Session()

    def scrape_images(self, url, category, selector='img'):
        """Download every image matched by *selector* on *url* into output/category.

        Falls back to the 'data-src' attribute for lazily loaded images.
        """
        cat_dir = os.path.join(self.output, category)
        os.makedirs(cat_dir, exist_ok=True)
        # Fix: the original request had no timeout and could hang indefinitely.
        resp = self.session.get(url, timeout=15)
        soup = BeautifulSoup(resp.text, 'html.parser')
        for img in soup.select(selector):
            src = img.get('src') or img.get('data-src')
            if src:
                self._download(urljoin(url, src), cat_dir)

    @staticmethod
    def _ext_from_url(url, default='jpg'):
        """Best-effort file extension from a URL path; falls back to *default*.

        Fix: the old expression url.split('.')[-1] returned the whole URL
        (truncated to 4 chars) whenever the path contained no dot, writing
        files with garbage extensions.
        """
        path = url.split('?')[0].split('#')[0]
        tail = path.rsplit('/', 1)[-1]
        if '.' in tail:
            ext = tail.rsplit('.', 1)[-1][:4]
            if ext.isalnum():
                return ext.lower()
        return default

    def _download(self, url, directory):
        """Fetch one image and write it to *directory*; best-effort, never raises.

        Non-image responses (per Content-Type) are skipped. The MD5 of the URL
        is used as the filename, which also deduplicates repeated URLs.
        """
        try:
            resp = self.session.get(url, timeout=10)
            if 'image' not in resp.headers.get('content-type', ''):
                return
            name = hashlib.md5(url.encode()).hexdigest()
            ext = self._ext_from_url(url)
            with open(os.path.join(directory, f'{name}.{ext}'), 'wb') as f:
                f.write(resp.content)
        except OSError:
            # Deliberately best-effort: one bad connection or file shouldn't kill
            # the crawl. Narrowed from bare Exception — requests.RequestException
            # subclasses IOError/OSError, so network errors are still covered,
            # while programming errors (TypeError etc.) now surface.
            pass
Data Cleaning Pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
class DataCleaner:
    """Deduplicate, normalize, and sentiment-label scraped review data."""

    def clean(self, df):
        """Return a cleaned copy of *df* with added 'clean' and 'sentiment' columns.

        Expects 'text' and 'rating' columns. Drops duplicate texts, rows whose
        cleaned text is 20 characters or shorter, and rows missing text/rating.
        """
        # .copy() after each filtering step avoids pandas SettingWithCopyWarning
        # when assigning new columns below (fix: the original assigned into a
        # boolean-mask slice).
        df = df.drop_duplicates(subset=['text']).copy()
        # Strip inline HTML tags, then collapse whitespace runs to single spaces.
        df['clean'] = df['text'].apply(lambda t: re.sub(r'<[^>]+>', '', t))
        df['clean'] = df['clean'].apply(lambda t: re.sub(r'\s+', ' ', t).strip())
        df = df[df['clean'].str.len() > 20].dropna(subset=['clean', 'rating']).copy()
        # 4-5 stars -> positive, 1-2 -> negative, 3 -> neutral.
        df['sentiment'] = df['rating'].apply(
            lambda r: 'positive' if r >= 4 else ('negative' if r <= 2 else 'neutral'))
        return df

    def split(self, df, test_size=0.2, random_state=None):
        """Stratified train/test split; writes train.csv / test.csv and returns both.

        *random_state* (new, optional) makes the split reproducible; the default
        of None preserves the previous nondeterministic behavior.
        """
        train, test = train_test_split(
            df, test_size=test_size, stratify=df['sentiment'],
            random_state=random_state)
        train.to_csv('train.csv', index=False)
        test.to_csv('test.csv', index=False)
        return train, test
Scaling Your Scraping
When building large ML datasets, ScraperAPI handles proxy rotation and anti-bot detection. ThorData provides residential proxies for region-specific content, and ScrapeOps monitors your pipelines.
Conclusion
Web scraping bridges raw internet data and trained ML models. Focus on data quality over quantity, automate cleaning, and always validate before training.
Top comments (0)