DEV Community

Vhub Systems

Using GPT-4 and Claude to Extract Structured Data From Any Webpage in 2026

Traditional web scraping breaks when sites change their HTML structure. LLM-based extraction doesn't — you describe what you want in plain English, and the model finds it regardless of how the page is structured.

Here's when this approach beats traditional scraping, and the complete implementation.

The Core Idea

Traditional scraping:

price = soup.find('span', class_='product-price').text  # Breaks if class changes

LLM extraction:

price = llm_extract("What is the product price on this page?", page_html)
# Works even if the structure changes completely

The trade-off: LLM extraction costs money and is slower. Traditional scraping is free and fast. Use LLMs when:

  • Structure changes frequently (news sites, e-commerce with AB testing)
  • You're scraping many different sites and can't maintain per-site parsers
  • You need semantic understanding (sentiment, summaries, classifications)
  • The data is in tables, PDFs, images, or other unstructured formats

Method 1: Direct HTML → Structured JSON with GPT-4o-mini

from openai import OpenAI
from bs4 import BeautifulSoup
import requests, json

client = OpenAI()  # Uses OPENAI_API_KEY env var

def extract_with_gpt(url: str, schema: dict) -> dict:
    """
    Extract structured data from a webpage using GPT-4o-mini.

    schema: dict describing what to extract
    Example: {"product_name": "str", "price": "float", "rating": "float", "reviews_count": "int"}
    """
    # Get the page HTML
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"})

    # Clean HTML to reduce token count (remove scripts, styles, comments)
    soup = BeautifulSoup(r.text, 'html.parser')
    for tag in soup.find_all(['script', 'style', 'noscript', 'meta', 'link']):
        tag.decompose()

    # Get clean text (much cheaper than full HTML)
    clean_text = soup.get_text(separator='\n', strip=True)

    # Truncate to fit context window (keep relevant parts)
    max_chars = 12000  # ~3000 tokens
    if len(clean_text) > max_chars:
        clean_text = clean_text[:max_chars] + "\n...[truncated]"

    schema_str = json.dumps(schema, indent=2)

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheapest option, ~$0.0002 per page
        messages=[
            {
                "role": "system",
                "content": f"""Extract data from the webpage text and return ONLY a JSON object matching this schema:
{schema_str}

Rules:
- Return ONLY valid JSON, no other text
- Use null for missing fields
- Convert prices to numbers (remove currency symbols)
- If a field isn't found, return null"""
            },
            {
                "role": "user", 
                "content": f"Extract data from this webpage:\n\n{clean_text}"
            }
        ],
        temperature=0  # Deterministic output
    )

    result_text = response.choices[0].message.content

    try:
        return json.loads(result_text)
    except json.JSONDecodeError:
        # Sometimes models wrap the JSON in a markdown code fence
        import re
        match = re.search(r'```(?:json)?\s*(.*?)\s*```', result_text, re.DOTALL)
        if match:
            return json.loads(match.group(1))
        raise

# Usage
product_data = extract_with_gpt(
    "https://www.amazon.com/dp/B09G9FPHY6",
    schema={
        "product_name": "str",
        "price": "float",
        "rating": "float", 
        "reviews_count": "int",
        "availability": "str",
        "brand": "str"
    }
)

print(json.dumps(product_data, indent=2))
# {
#   "product_name": "Echo Dot (5th Gen)",
#   "price": 49.99,
#   "rating": 4.7,
#   "reviews_count": 123456,
#   "availability": "In Stock",
#   "brand": "Amazon"
# }

Cost estimate: GPT-4o-mini at ~3000 tokens/page = ~$0.0002/page = $0.20 per 1000 pages.
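That estimate is simple arithmetic worth folding into batch planning. A minimal sketch using the ~$0.0002/page figure above (the function name and default are illustrative, not from any SDK):

```python
def extraction_budget(pages: int, cost_per_page: float = 0.0002) -> float:
    """Estimated spend for a batch at the ~$0.0002/page GPT-4o-mini rate."""
    return round(pages * cost_per_page, 4)

print(extraction_budget(1000))  # 0.2 — i.e. $0.20 per 1000 pages
```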

Method 2: Structured Output with Pydantic Validation

Force the LLM to return valid structured data using Pydantic:

from openai import OpenAI
from pydantic import BaseModel, Field
from typing import Optional, List
import requests
from bs4 import BeautifulSoup

client = OpenAI()

class ProductReview(BaseModel):
    rating: float = Field(ge=1, le=5)
    text: str
    author: str
    date: Optional[str] = None
    verified_purchase: bool = False

class ProductData(BaseModel):
    name: str
    price: Optional[float] = None
    currency: str = "USD"
    rating: Optional[float] = Field(None, ge=0, le=5)
    review_count: Optional[int] = None
    availability: Optional[str] = None
    features: List[str] = []
    top_reviews: List[ProductReview] = []

def extract_product_structured(url: str) -> ProductData:
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, 'html.parser')
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    text = soup.get_text(separator='\n', strip=True)[:12000]

    # Use OpenAI's structured output feature
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract product information from this webpage. Include top 3 reviews if available."},
            {"role": "user", "content": text}
        ],
        response_format=ProductData
    )

    return response.choices[0].message.parsed

product = extract_product_structured("https://www.amazon.com/dp/...")
print(f"Product: {product.name}")
print(f"Price: ${product.price}")
print(f"Rating: {product.rating}/5 ({product.review_count} reviews)")
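The Field constraints are what make this method safer than free-form JSON: out-of-range values are rejected at parse time instead of landing in your data. A standalone sketch of the same idea, using a trimmed copy of the model above and no API call (assumes Pydantic v2):

```python
from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class ProductData(BaseModel):
    name: str
    price: Optional[float] = None
    rating: Optional[float] = Field(None, ge=0, le=5)

# Values are coerced to the declared types ("49.99" -> 49.99)
ok = ProductData.model_validate({"name": "Echo Dot", "price": "49.99", "rating": 4.7})

# Constraint violations raise instead of passing through silently
try:
    ProductData.model_validate({"name": "Widget", "rating": 9.9})
except ValidationError:
    pass  # rating must be <= 5
```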

Method 3: Claude for Long Documents

Claude handles larger contexts better — useful for long articles, reports, or multi-page PDFs:

import anthropic
import requests
from bs4 import BeautifulSoup
import json

client = anthropic.Anthropic()  # Uses ANTHROPIC_API_KEY env var

def extract_with_claude(url: str, extraction_prompt: str) -> str:
    """
    Extract information from a webpage using Claude.
    Better for: long pages, nuanced extraction, natural language output.
    """
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, 'html.parser')
    for tag in soup.find_all(['script', 'style', 'noscript']):
        tag.decompose()

    # Claude can handle up to 200K tokens — much more headroom
    text = soup.get_text(separator='\n', strip=True)[:50000]

    message = client.messages.create(
        model="claude-3-5-haiku-latest",  # Cheapest Claude model
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": f"{extraction_prompt}\n\nWebpage content:\n{text}"
        }]
    )

    return message.content[0].text

# Example: Extract pricing table from a SaaS pricing page
pricing = extract_with_claude(
    "https://example-saas.com/pricing",
    """Extract the pricing information and return it as a JSON array.
    Each plan should have: name, monthly_price, annual_price, features (list).
    Return ONLY the JSON, no other text."""
)
print(json.loads(pricing))
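One caveat: the `json.loads(pricing)` call throws if Claude wraps its answer in a markdown code fence despite the prompt. A small defensive parser (the helper name is mine) covers both cases:

```python
import json
import re

def parse_llm_json(raw: str):
    """Parse an LLM reply as JSON, tolerating an optional markdown code fence."""
    fenced = re.search(r'`{3}(?:json)?\s*(.*?)\s*`{3}', raw, re.DOTALL)
    return json.loads(fenced.group(1) if fenced else raw.strip())
```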

Method 4: Async Batch Extraction (Production Scale)

For scraping 100+ pages cost-effectively:

import asyncio
import aiohttp
from openai import AsyncOpenAI
from bs4 import BeautifulSoup
import json
from typing import List, Dict

client = AsyncOpenAI()

async def fetch_page(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch a page asynchronously."""
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
    async with session.get(url, headers=headers) as response:
        html = await response.text()
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all(['script', 'style']):
            tag.decompose()
        return soup.get_text(separator='\n', strip=True)[:8000]

async def extract_from_page(text: str, schema: dict) -> dict:
    """Extract structured data using GPT-4o-mini."""
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Return ONLY a JSON object matching: {json.dumps(schema)}"},
            {"role": "user", "content": text}
        ],
        temperature=0
    )
    try:
        return json.loads(response.choices[0].message.content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a None content; return an empty record rather than crash the batch
        return {}

async def batch_extract(urls: List[str], schema: dict, concurrency: int = 5) -> List[Dict]:
    """Extract data from many URLs concurrently."""
    semaphore = asyncio.Semaphore(concurrency)

    async def process_url(session, url):
        async with semaphore:
            text = await fetch_page(session, url)
            result = await extract_from_page(text, schema)
            result['_url'] = url
            return result

    async with aiohttp.ClientSession() as session:
        tasks = [process_url(session, url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    return [r for r in results if isinstance(r, dict)]

# Usage: extract from 100 product pages
urls = ["https://shop.example.com/product/1", "https://shop.example.com/product/2", ...]
schema = {"name": "str", "price": "float", "in_stock": "bool"}

results = asyncio.run(batch_extract(urls, schema, concurrency=5))
print(f"Extracted {len(results)} products")

# Cost for 100 pages at ~$0.0002/page = $0.02
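At this scale you will eventually hit rate limits, and the `gather` call above silently drops those pages. A hedged retry wrapper you could put around `extract_from_page` (a sketch; in real code, catch `openai.RateLimitError` rather than bare `Exception`):

```python
import asyncio
import random

async def with_retries(call, attempts: int = 4, base_delay: float = 1.0):
    """Await call() with exponential backoff plus jitter between failed attempts."""
    for attempt in range(attempts):
        try:
            return await call()
        except Exception:
            if attempt == attempts - 1:
                raise
            # 1s, 2s, 4s... plus jitter so concurrent workers don't retry in lockstep
            await asyncio.sleep(base_delay * 2 ** attempt + random.uniform(0, base_delay))
```

Usage inside `process_url` would look like `result = await with_retries(lambda: extract_from_page(text, schema))`.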

Hybrid Approach: Fast + Accurate

Use CSS selectors first, fall back to LLM only when they fail:

import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import json

client = OpenAI()

SELECTORS = {
    'amazon': {
        'name': '#productTitle',
        'price': '.a-price-whole',
        'rating': '.a-icon-alt',
    },
    'generic': None  # Fall back to LLM
}

def extract_price_hybrid(url: str) -> dict:
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, 'html.parser')

    # Try fast CSS selector first
    if 'amazon.com' in url:
        selectors = SELECTORS['amazon']
        result = {}
        all_found = True

        for field, selector in selectors.items():
            elem = soup.select_one(selector)
            if elem:
                result[field] = elem.text.strip()
            else:
                all_found = False
                break

        if all_found:
            return {'method': 'css', 'data': result}

    # Fallback to LLM
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    text = soup.get_text(separator='\n', strip=True)[:10000]

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": 'Return ONLY JSON: {"name": "str", "price": "float", "rating": "float"}'},
            {"role": "user", "content": text}
        ],
        temperature=0
    )

    try:
        return {'method': 'llm', 'data': json.loads(response.choices[0].message.content)}
    except (json.JSONDecodeError, TypeError):
        return {'method': 'failed', 'data': {}}

result = extract_price_hybrid("https://amazon.com/dp/...")
print(f"Method: {result['method']}, Data: {result['data']}")

Cost Comparison by Volume

Volume          CSS-only   GPT-4o-mini   GPT-4o     Claude Haiku
100 pages       $0         $0.02         $0.50      $0.03
1,000 pages     $0         $0.20         $5.00      $0.30
10,000 pages    $0         $2.00         $50.00     $3.00
100,000 pages   $0         $20.00        $500.00    $30.00

Practical rule: Use CSS selectors for sites you control or know well. Use LLMs for:

  • Unknown site structures (one-off extractions)
  • Semantic extraction (summaries, classifications)
  • Fallback when selectors break

Want ready-to-use web scraping tools without the setup hassle?

The $29 Apify Scrapers Bundle includes 35+ production-ready scrapers — Google SERP, LinkedIn, Amazon, TikTok Shop, contact info, and more. One-time payment, instant download.

👉 Get the Bundle ($29)

