Travel booking data is among the most valuable — and hardest to scrape — on the web. Booking.com, Expedia, and Kayak use aggressive anti-bot measures, dynamic pricing, and JavaScript-heavy interfaces. Here's how to extract travel pricing data reliably.
Why Travel Data Is Challenging
- Heavy JavaScript rendering (React/Next.js frontends)
- CAPTCHAs and bot detection (Akamai, PerimeterX, DataDome)
- Session-based pricing (cookies influence displayed prices)
- Dynamic content loaded via XHR/API calls
- Geo-dependent pricing (different prices by location)
Setting Up Robust Scraping
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime, timedelta
# Replace with your own ScraperAPI key (found in the ScraperAPI dashboard).
API_KEY = "YOUR_SCRAPERAPI_KEY"
class TravelScraper:
    """Fetches travel-site pages through the ScraperAPI proxy service.

    ScraperAPI performs the JavaScript rendering and anti-bot bypasses
    (CAPTCHAs, fingerprinting) that booking sites rely on, so the caller
    only deals with final HTML.
    """

    def __init__(self, api_key):
        # ScraperAPI account key; see API_KEY above.
        self.api_key = api_key

    def scrape(self, url, country="us", render=True):
        """Fetch *url* via ScraperAPI and return it parsed as BeautifulSoup.

        Args:
            url: Target page to fetch.
            country: Two-letter country code for geo-targeted proxying.
            render: Whether ScraperAPI should execute JavaScript first.

        Returns:
            A BeautifulSoup tree of the rendered page.

        Raises:
            requests.HTTPError: if ScraperAPI answers with a non-2xx status
                (quota exhausted, blocked target, bad key, ...).
        """
        params = {
            "api_key": self.api_key,
            "url": url,
            # ScraperAPI expects string booleans, not Python bools.
            "render": "true" if render else "false",
            "country_code": country,
        }
        resp = requests.get(
            "https://api.scraperapi.com", params=params, timeout=90
        )
        # Bug fix: fail loudly on proxy/quota errors instead of silently
        # parsing an error page as if it were the target site's HTML.
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
ScraperAPI handles JavaScript rendering and anti-bot bypasses that travel sites rely on.
Scraping Hotel Prices
import re
def scrape_hotel_prices(self, city, checkin, checkout, guests=2):
    """Scrape Booking.com search results for *city* between two dates.

    Returns a list of dicts with keys ``name``, ``price_per_night``,
    ``rating`` (None when the card shows no review score) and ``city``.
    """
    checkin_str = checkin.strftime("%Y-%m-%d")
    checkout_str = checkout.strftime("%Y-%m-%d")
    url = (
        f"https://www.booking.com/searchresults.html"
        f"?ss={city}&checkin={checkin_str}&checkout={checkout_str}"
        f"&group_adults={guests}&no_rooms=1&order=price"
    )
    soup = self.scrape(url)

    results = []
    for card in soup.find_all("div", {"data-testid": "property-card"}):
        title = card.find("div", {"data-testid": "title"})
        price = card.find("span", {"data-testid": "price-and-discounted-price"})
        if not (title and price):
            # Skip ad tiles / cards without a name or a visible price.
            continue
        score = card.find("div", {"data-testid": "review-score"})
        results.append({
            "name": title.get_text(strip=True),
            "price_per_night": parse_price(price.get_text(strip=True)),
            "rating": score.get_text(strip=True) if score else None,
            "city": city,
        })
    return results
def parse_price(text):
    """Extract the first numeric amount from a price string.

    Thousands separators are stripped before matching. Unlike the previous
    ``[\\d,]+`` pattern, the decimal fraction is preserved, so "$123.45"
    parses as 123.45 rather than 123.0.

    Args:
        text: Raw price text, e.g. "$1,234.56" or "from €99".

    Returns:
        The amount as a float, or None when *text* contains no digits.
    """
    match = re.search(r"\d+(?:\.\d+)?", text.replace(",", ""))
    return float(match.group()) if match else None
Scraping Flight Prices
def scrape_flights(self, origin, dest, date):
    """Scrape Kayak flight results for one route and date.

    Returns a list of dicts with ``price``, ``airline`` ("Unknown" when
    no carrier element is found), ``route`` and ``date``.
    """
    date_str = date.strftime("%Y-%m-%d")
    url = f"https://www.kayak.com/flights/{origin}-{dest}/{date_str}?sort=bestflight_a"
    soup = self.scrape(url)

    def class_contains(token):
        # Kayak uses hashed class names; match on a stable substring.
        return lambda c: c and token in str(c).lower()

    flights = []
    for row in soup.find_all("div", class_=lambda c: c and "resultInner" in c):
        price = row.find("span", class_=class_contains("price"))
        if price is None:
            continue  # rows without a price are layout chrome, not offers
        carrier = row.find("span", class_=class_contains("airline"))
        flights.append({
            "price": parse_price(price.get_text(strip=True)),
            "airline": carrier.get_text(strip=True) if carrier else "Unknown",
            "route": f"{origin} -> {dest}",
            "date": date_str,
        })
    return flights
Car Rental Price Comparison
def scrape_car_rentals(self, location, pickup_date, return_date):
    """Scrape Kayak car-rental offers for a location and date range.

    Returns a list of dicts with ``company`` ("Unknown" when no vendor
    element is found), ``total_price`` and ``location``.
    """
    url = "https://www.kayak.com/cars/{}/{}/{}".format(
        location,
        pickup_date.strftime("%Y-%m-%d"),
        return_date.strftime("%Y-%m-%d"),
    )
    soup = self.scrape(url)

    offers = []
    for card in soup.find_all("div", class_=lambda c: c and "resultCard" in str(c)):
        price = card.find("span", class_=lambda c: c and "price" in str(c).lower())
        if price is None:
            continue  # a card without a price is not a bookable offer
        vendor = card.find("span", class_=lambda c: c and "company" in str(c).lower())
        offers.append({
            "company": vendor.get_text(strip=True) if vendor else "Unknown",
            "total_price": parse_price(price.get_text(strip=True)),
            "location": location,
        })
    return offers
Geo-Pricing Detection
Travel prices vary by apparent location. Test by routing the same request through different countries with ScraperAPI's country_code parameter:
def detect_geo_pricing(scraper, url, countries):
    """Fetch *url* once per country code and map country -> parsed price.

    Countries whose page shows no price element are omitted from the
    result rather than recorded as None.
    """
    observed = {}
    for cc in countries:
        page = scraper.scrape(url, country=cc)
        tag = page.select_one("[data-testid='price-and-discounted-price']")
        if tag is not None:
            observed[cc] = parse_price(tag.get_text())
        time.sleep(3)  # be polite between geo-targeted requests
    return observed
# Demo: compare the same hotel page's displayed price across five proxy
# countries to reveal geo-dependent pricing.
geo_prices = detect_geo_pricing(
    TravelScraper(API_KEY),
    "https://www.booking.com/hotel/fr/example.html",
    ["us", "uk", "de", "in", "br"]
)
# NOTE(review): the "$" prefix is cosmetic -- each country's page may
# display a different local currency; confirm before comparing values.
for country, price in geo_prices.items():
    print(f"{country.upper()}: ${price}")
Price Tracking Over Time
import sqlite3
def track_travel_prices(db_path="travel_prices.db"):
    """Snapshot current Paris hotel prices into a local SQLite table.

    Creates the ``travel_prices`` table on first run; each call appends
    up to five rows stamped with the scrape time, so repeated runs
    accumulate a price history.

    Args:
        db_path: Path of the SQLite database file to create/append to.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS travel_prices (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                category TEXT, name TEXT, price REAL,
                route TEXT, travel_date TEXT, scraped_at TEXT
            )
        ''')
        scraper = TravelScraper(API_KEY)
        # Sample window: a 3-night stay starting 30 days out.
        checkin = datetime.now() + timedelta(days=30)
        checkout = checkin + timedelta(days=3)
        hotels = scraper.scrape_hotel_prices("Paris", checkin, checkout)
        for h in hotels[:5]:
            # Explicit column list instead of positional VALUES(NULL, ...),
            # so the INSERT survives future schema additions.
            conn.execute(
                "INSERT INTO travel_prices "
                "(category, name, price, route, travel_date, scraped_at) "
                "VALUES (?, ?, ?, ?, ?, ?)",
                ("hotel", h["name"], h["price_per_night"], h["city"],
                 str(checkin.date()), datetime.utcnow().isoformat()),
            )
        conn.commit()
    finally:
        # Bug fix: the original never closed the connection, leaking the
        # file handle on every call.
        conn.close()


track_travel_prices()
Monitor scraper reliability with ScrapeOps — booking engines change layouts frequently.
Travel data scraping requires robust infrastructure, but the payoff is enormous. Price comparison tools, fare alerts, and travel analytics all depend on reliable extraction from booking engines.
Happy scraping!
Top comments (0)