DEV Community

ZNY
ZNY

Posted on

DEV.TO ARTICLE 45: Web Scraping with AI: Extracting Data Using Claude Vision

Target Keyword: "web scraping ai claude vision"
Tags: web-scraping,ai,python,programming,developer
Type: Tutorial


Content

Web Scraping with AI: Extracting Data Using Claude Vision

Traditional web scraping breaks when sites use JavaScript rendering, CAPTCHAs, or complex layouts. AI-powered scraping with Claude Vision handles these cases elegantly. Here's how to build intelligent scrapers.

Why AI for Web Scraping?

Traditional scrapers:

  • Rely on brittle selectors that break with UI changes
  • Can't handle JavaScript-rendered content
  • Struggle with complex layouts and CAPTCHAs

AI scrapers:

  • Understand page structure semantically
  • Handle visual layouts like humans do
  • Bypass many anti-bot measures

Screenshot + Claude Vision Pipeline

import asyncio
import base64
import random
from typing import Optional

import httpx
from playwright.async_api import async_playwright

class AIScraper:
    """Screenshot-driven scraper that delegates data extraction to Claude Vision."""

    def __init__(self, api_key: str):
        # API key for the Vision endpoint; sent as a Bearer token on every call.
        self.api_key = api_key

    async def screenshot(self, url: str) -> bytes:
        """Render *url* in headless Chromium and return a full-page PNG."""
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()
                # "networkidle" lets JavaScript-rendered content settle first.
                await page.goto(url, wait_until="networkidle")
                return await page.screenshot(full_page=True)
            finally:
                # Close even when goto/screenshot raises, so headless
                # Chromium processes are never leaked.
                await browser.close()

    async def extract_with_vision(self, screenshot: bytes, query: str) -> str:
        """Send the screenshot to Claude Vision and return the extracted text.

        Raises:
            httpx.HTTPStatusError: if the API responds with a 4xx/5xx status.
        """
        image_b64 = base64.b64encode(screenshot).decode()

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.ofox.ai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": "claude-3-5-sonnet-20241022",
                    "messages": [{
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_b64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": f"Extract the following data from this webpage screenshot: {query}. Return the data in a structured format."
                            }
                        ]
                    }],
                    "max_tokens": 4096
                }
            )

            # Fail loudly on an error status instead of raising a
            # confusing KeyError on the missing "choices" field below.
            response.raise_for_status()
            data = response.json()
            return data["choices"][0]["message"]["content"]

    async def scrape(self, url: str, query: str) -> str:
        """Full pipeline: screenshot *url*, then extract *query* via Vision."""
        screenshot = await self.screenshot(url)
        return await self.extract_with_vision(screenshot, query)
Enter fullscreen mode Exit fullscreen mode

Structured Data Extraction

async def extract_job_listings(scraper: "AIScraper", url: str) -> list[dict]:
    """Extract job listings from a job-board page.

    Args:
        scraper: Any object exposing an async ``scrape(url, query)`` method.
        url: Job-board page to scrape.

    Returns:
        A list of job dicts, or ``[]`` when no valid JSON array can be
        parsed out of the model's response.
    """
    # Local imports keep this snippet self-contained.
    import json
    import re

    query = """
    Extract all job listings from this page. For each job, return:
    - Job title
    - Company name
    - Location
    - Salary range (if visible)
    - Posted date
    - Job URL

    Format as a JSON array of objects.
    """

    result = await scraper.scrape(url, query)

    # The model may wrap the JSON in prose; grab the outermost array.
    json_match = re.search(r'\[.*\]', result, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            # Array-shaped text that isn't valid JSON — treat as no data.
            return []
    return []

# Usage: `await` is only valid inside a coroutine, so drive the
# pipeline from an async main() via asyncio.run().
async def main():
    scraper = AIScraper("YOUR_API_KEY")
    jobs = await extract_job_listings(scraper, "https://remoteok.com/remote-python-jobs")
    for job in jobs:
        print(f"{job['title']} at {job['company']}")

if __name__ == "__main__":
    asyncio.run(main())
Enter fullscreen mode Exit fullscreen mode

Handling Pagination

async def scrape_all_pages(scraper: AIScraper, base_url: str, query: str, max_pages: int = 10):
    """Walk a paginated site and collect the raw scrape result per page.

    Stops early when a page yields nothing, reports "no results",
    or the scrape raises.
    """
    collected = []

    page_no = 1
    while page_no <= max_pages:
        page_url = f"{base_url}?page={page_no}"
        print(f"Scraping page {page_no}...")

        try:
            text = await scraper.scrape(page_url, query)
            if not text or "no results" in text.lower():
                break
            collected.append(text)
            await asyncio.sleep(2)  # Be respectful
        except Exception as exc:
            print(f"Error on page {page_no}: {exc}")
            break

        page_no += 1

    return collected
Enter fullscreen mode Exit fullscreen mode

Real-World Examples

Example 1: Real Estate Listings

async def scrape_real_estate(scraper: AIScraper, city: str):
    """Scrape property listings for *city* from Zillow."""
    # Zillow encodes the city into the URL path with underscores.
    slug = city.replace(' ', '_')
    target = f"https://www.zillow.com/homes/{slug}_rb/"

    prompt = """
    Extract all property listings. For each property:
    - Address
    - Price
    - Number of bedrooms/bathrooms
    - Square footage
    - Property type (house, condo, etc.)
    """

    return await scraper.scrape(target, prompt)
Enter fullscreen mode Exit fullscreen mode

Example 2: Product Price Monitoring

async def monitor_prices(scraper: "AIScraper", urls: list[str]) -> list[dict]:
    """Monitor prices across multiple product pages.

    Args:
        scraper: Any object exposing an async ``scrape(url, query)`` method.
        urls: Product pages to check.

    Returns:
        One ``{"url": ..., "data": ...}`` dict per input URL, in order.
    """
    # The extraction prompt is identical for every page — build it once
    # instead of re-creating it on each loop iteration.
    query = """
    Extract:
    - Product name
    - Current price
    - Original price (if discounted)
    - Discount percentage
    - Availability status
    """

    results = []
    for i, url in enumerate(urls):
        result = await scraper.scrape(url, query)
        results.append({"url": url, "data": result})
        # Rate-limit between requests, but skip the pointless sleep
        # after the final URL.
        if i < len(urls) - 1:
            await asyncio.sleep(3)

    return results
Enter fullscreen mode Exit fullscreen mode

Anti-Bot Evasion

async def stealth_screenshot(url: str) -> bytes:
    """Take a full-page screenshot while evading common bot detection.

    Launches Chromium, creates a context with a realistic viewport,
    masks the ``navigator.webdriver`` flag, and adds randomized delays
    so the session looks less like automation.

    Args:
        url: Page to capture.

    Returns:
        PNG screenshot bytes of the full page.
    """
    async with async_playwright() as p:
        # Playwright has no `launch_context()`; use launch() + new_context().
        # Setting the viewport on the context means every page inherits it —
        # no throwaway page needed.
        browser = await p.chromium.launch()
        try:
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080}
            )

            # Hide the tell-tale navigator.webdriver flag before any
            # page script runs.
            await context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            """)

            page = await context.new_page()

            # Randomized, human-like pauses around navigation.
            await page.wait_for_timeout(random.uniform(1000, 3000))
            await page.goto(url, wait_until="networkidle")
            await page.wait_for_timeout(random.uniform(1000, 2000))

            return await page.screenshot(full_page=True)
        finally:
            # Always release the browser, even if navigation fails.
            await browser.close()
Enter fullscreen mode Exit fullscreen mode

Ethical Scraping Guidelines

  1. Respect robots.txt — Check before scraping
  2. Rate limit — Add delays between requests
  3. User-Agent — Identify your bot honestly
  4. Cache results — Don't re-scrape the same page
  5. Terms of Service — Don't violate site terms

Getting Started

Build AI-powered scrapers with ofox.ai — their Vision-capable API makes intelligent data extraction simple and reliable.

👉 Get started with ofox.ai


This article contains affiliate links.


Tags: web-scraping,ai,python,programming,developer
Canonical URL: https://dev.to/zny10289

Top comments (0)