Target Keyword: "web scraping ai claude vision"
Tags: web-scraping,ai,python,programming,developer
Type: Tutorial
Content
Web Scraping with AI: Extracting Data Using Claude Vision
Traditional web scraping breaks when sites use JavaScript rendering, CAPTCHAs, or complex layouts. AI-powered scraping with Claude Vision handles these cases elegantly. Here's how to build intelligent scrapers.
Why AI for Web Scraping?
Traditional scrapers:
- Brittle selectors that break with UI changes
- Can't handle JavaScript-rendered content
- Struggle with complex layouts and CAPTCHAs
AI scrapers:
- Understand page structure semantically
- Handle visual layouts like humans do
- Bypass many anti-bot measures
Screenshot + Claude Vision Pipeline
import asyncio
import httpx
from playwright.async_api import async_playwright
import base64
from typing import Optional
class AIScraper:
    """Screenshot-based scraper that uses a vision model to extract page data."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    async def screenshot(self, url: str) -> bytes:
        """Take a full-page screenshot of *url* and return it as PNG bytes."""
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()
                # networkidle ensures JS-rendered content is present in the capture
                await page.goto(url, wait_until="networkidle")
                return await page.screenshot(full_page=True)
            finally:
                # Release the browser even if navigation or capture fails.
                await browser.close()

    async def extract_with_vision(self, screenshot: bytes, query: str) -> str:
        """Send *screenshot* to the vision API and return the extracted text.

        Raises:
            httpx.HTTPStatusError: if the API responds with a non-2xx status.
            KeyError: if the response body lacks the expected structure.
        """
        image_b64 = base64.b64encode(screenshot).decode()
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.ofox.ai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "claude-3-5-sonnet-20241022",
                    "messages": [{
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_b64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": f"Extract the following data from this webpage screenshot: {query}. Return the data in a structured format."
                            }
                        ]
                    }],
                    "max_tokens": 4096
                }
            )
            # Fail loudly on auth/rate-limit/server errors instead of
            # raising an opaque KeyError when parsing the body below.
            response.raise_for_status()
        data = response.json()
        return data["choices"][0]["message"]["content"]

    async def scrape(self, url: str, query: str) -> str:
        """Full pipeline: screenshot *url*, then extract *query* from it."""
        screenshot = await self.screenshot(url)
        return await self.extract_with_vision(screenshot, query)
Structured Data Extraction
async def extract_job_listings(scraper: AIScraper, url: str) -> list[dict]:
    """Extract job listings from a job board page.

    Returns a list of job dicts, or an empty list when no parseable
    JSON array can be recovered from the model's response.
    """
    import json
    import re

    query = """
    Extract all job listings from this page. For each job, return:
    - Job title
    - Company name
    - Location
    - Salary range (if visible)
    - Posted date
    - Job URL
    Format as a JSON array of objects.
    """
    result = await scraper.scrape(url, query)
    # The model may wrap the JSON in prose; grab the outermost [...] span.
    json_match = re.search(r'\[.*\]', result, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            # The matched span only looked like JSON — treat as no results.
            return []
    return []
# Usage — `await` is only legal inside a coroutine, so the example must
# run under an event loop (the original snippet used a bare top-level
# await and an undefined `scraper`).
async def main() -> None:
    scraper = AIScraper(api_key="YOUR_API_KEY")
    jobs = await extract_job_listings(scraper, "https://remoteok.com/remote-python-jobs")
    for job in jobs:
        print(f"{job['title']} at {job['company']}")

if __name__ == "__main__":
    asyncio.run(main())
Handling Pagination
async def scrape_all_pages(scraper: AIScraper, base_url: str, query: str, max_pages: int = 10):
    """Walk a paginated site, scraping each page until it runs dry.

    Stops early on an empty answer, an explicit "no results", or any
    scraping error, and returns whatever was collected so far.
    """
    collected = []
    page_number = 1
    while page_number <= max_pages:
        page_url = f"{base_url}?page={page_number}"
        print(f"Scraping page {page_number}...")
        try:
            extracted = await scraper.scrape(page_url, query)
            # An empty answer or an explicit "no results" marks the last page.
            if not extracted or "no results" in extracted.lower():
                break
            collected.append(extracted)
            await asyncio.sleep(2)  # Be respectful
        except Exception as error:
            print(f"Error on page {page_number}: {error}")
            break
        page_number += 1
    return collected
Real-World Examples
Example 1: Real Estate Listings
async def scrape_real_estate(scraper: AIScraper, city: str):
    """Pull property listings for *city* from Zillow's search results."""
    # Zillow's search path uses underscores in place of spaces.
    slug = city.replace(' ', '_')
    listing_url = f"https://www.zillow.com/homes/{slug}_rb/"
    query = """
    Extract all property listings. For each property:
    - Address
    - Price
    - Number of bedrooms/bathrooms
    - Square footage
    - Property type (house, condo, etc.)
    """
    return await scraper.scrape(listing_url, query)
Example 2: Product Price Monitoring
async def monitor_prices(scraper: AIScraper, urls: list[str]) -> list[dict]:
    """Collect pricing details from each product page in *urls*.

    Returns one {"url": ..., "data": ...} record per page, pausing
    between requests to stay polite.
    """
    # Same extraction prompt for every page, so define it once.
    query = """
    Extract:
    - Product name
    - Current price
    - Original price (if discounted)
    - Discount percentage
    - Availability status
    """
    snapshots = []
    for product_url in urls:
        extracted = await scraper.scrape(product_url, query)
        snapshots.append({"url": product_url, "data": extracted})
        await asyncio.sleep(3)
    return snapshots
Anti-Bot Evasion
async def stealth_screenshot(url: str) -> bytes:
    """Take a full-page screenshot while evading common bot detection.

    Uses a realistic desktop viewport, masks ``navigator.webdriver``
    before any page script runs, and adds human-like random delays
    around navigation.
    """
    import random  # local: only needed for the jittered delays

    async with async_playwright() as p:
        # `p.chromium.launch_context()` does not exist in Playwright's
        # async API — launch a browser, then create a context with the
        # viewport set up front (applies to every page in the context).
        browser = await p.chromium.launch()
        try:
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080}
            )
            # Hide the headless tell before any page script executes.
            await context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            """)
            page = await context.new_page()
            # Random delay before actions mimics a human pause.
            await page.wait_for_timeout(random.uniform(1000, 3000))
            await page.goto(url, wait_until="networkidle")
            await page.wait_for_timeout(random.uniform(1000, 2000))
            return await page.screenshot(full_page=True)
        finally:
            # Closing the browser also tears down its contexts/pages.
            await browser.close()
Ethical Scraping Guidelines
- Respect robots.txt — Check before scraping
- Rate limit — Add delays between requests
- User-Agent — Identify your bot honestly
- Cache results — Don't re-scrape the same page
- Terms of Service — Don't violate site terms
Getting Started
Build AI-powered scrapers with ofox.ai — their Vision-capable API makes intelligent data extraction simple and reliable.
This article contains affiliate links.
Tags: web-scraping,ai,python,programming,developer
Canonical URL: https://dev.to/zny10289
Top comments (0)