How to Scrape Glassdoor Reviews and Salaries Without Getting Blocked in 2026
Glassdoor has strong anti-bot protection — login walls, Cloudflare, and aggressive bot detection. But the data (company reviews, salary ranges, interview questions) is publicly visible. Here's what actually works in 2026.
What You Can Access
Without login (limited):
- Company overview: name, rating, industry, size, HQ
- Snippet of recent reviews (3-5 visible before login wall)
- Salary ranges (partial — "Software Engineer: $100K-$140K")
- Interview questions (partial)
With session cookies (full access):
- All reviews with full text, ratings, pros/cons
- Complete salary data with location and level
- Full interview experience reports
- Job listings with detailed descriptions
Method 1: Public API / JSON Endpoints
Glassdoor's web app fetches data via internal API calls. Intercept these and you skip the HTML scraping entirely:
from curl_cffi import requests as cf_requests
import json
def get_glassdoor_company_data(company_id: int) -> dict:
    """
    Fetch company data from Glassdoor's internal overview/reviews API.

    company_id: numeric employer ID found in the URL
        (e.g., glassdoor.com/Overview/EI_IE671.htm -> 671)

    Returns the decoded JSON payload, or {} on a non-200 response or when
    the body is not valid JSON — Glassdoor serves HTML challenge pages
    with a 200 status when it suspects a bot, which would otherwise make
    r.json() raise.
    """
    import random
    import time

    session = cf_requests.Session()
    # Warm up the session on the homepage so follow-up requests carry the
    # cookies Glassdoor expects before an API endpoint is hit directly.
    session.get("https://www.glassdoor.com/", impersonate="chrome124",
                headers={"Accept-Language": "en-US,en;q=0.9"})
    time.sleep(random.uniform(1.5, 3))

    # The internal API endpoint the web app itself calls.
    api_url = f"https://www.glassdoor.com/api/employer/{company_id}/overviewReviewsV2.htm"
    headers = {
        # Referer must look like a plausible on-site navigation.
        "Referer": f"https://www.glassdoor.com/Overview/Working-at-EI_IE{company_id}.htm",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
    }
    r = session.get(api_url, impersonate="chrome124", headers=headers)
    if r.status_code == 200:
        try:
            return r.json()
        except ValueError:
            # 200 with a non-JSON body = soft block / challenge page.
            return {}
    return {}
# Find company ID from URL: glassdoor.com/Overview/Working-at-Google-EI_IE9079.htm → 9079
google_data = get_glassdoor_company_data(9079)
# Preview only the first 500 characters of the (potentially large) payload.
print(json.dumps(google_data, indent=2)[:500])
Method 2: Structured Data from Review Pages
Glassdoor includes schema.org structured data on company pages:
from curl_cffi import requests as cf_requests
from bs4 import BeautifulSoup
import json, re, time
def scrape_glassdoor_overview(company_slug: str) -> dict:
    """
    Scrape a company's overview page from Glassdoor.

    company_slug: URL slug, e.g. 'Working-at-Google-EI_IE9079'

    Returns a dict of whatever could be extracted — schema.org
    Organization data plus visible rating/industry/size fields.
    Empty dict on a non-200 response.
    """
    session = cf_requests.Session()
    url = f"https://www.glassdoor.com/Overview/{company_slug}.htm"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.glassdoor.com/",
    }
    r = session.get(url, impersonate="chrome124", headers=headers)
    if r.status_code != 200:
        print(f"Got {r.status_code}")
        return {}

    soup = BeautifulSoup(r.text, 'html.parser')
    result = {}

    # Extract schema.org structured data (JSON-LD blocks).
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            # .string is None for empty/fragmented tags; '' makes loads raise
            # JSONDecodeError instead of TypeError.
            data = json.loads(script.string or '')
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict) or data.get('@type') != 'Organization':
            continue
        # Nested fields may be absent, None, or not dicts — guard each one
        # instead of chaining .get() on an assumed dict.
        employees = data.get('numberOfEmployees')
        rating = data.get('aggregateRating')
        result.update({
            'name': data.get('name', ''),
            'url': data.get('url', ''),
            # `or ''` handles an explicit null description.
            'description': (data.get('description') or '')[:300],
            'employee_count': employees.get('value', '') if isinstance(employees, dict) else '',
            'rating': rating.get('ratingValue', '') if isinstance(rating, dict) else '',
            'review_count': rating.get('reviewCount', '') if isinstance(rating, dict) else '',
        })

    # Extract visible rating from HTML (overrides the JSON-LD value if present).
    rating_elem = soup.select_one('[data-test="rating"]')
    if rating_elem:
        result['rating'] = rating_elem.text.strip()

    # Extract industry, size, and other labelled facts.
    for item in soup.select('[data-label]'):
        label = item.get('data-label', '')
        value = item.text.strip()
        if label in ('Industry', 'Size', 'Founded', 'Revenue', 'Type'):
            result[label.lower()] = value
    return result
# Usage
# The slug comes straight from the company's Overview URL on glassdoor.com.
data = scrape_glassdoor_overview("Working-at-Google-EI_IE9079")
print(json.dumps(data, indent=2))
Method 3: Glassdoor with Session Cookies (Full Access)
For access to full reviews (past the login wall):
from playwright.sync_api import sync_playwright
import json, time
# Export cookies from your Glassdoor browser session using Cookie Editor extension
# Save to /tmp/glassdoor_cookies.json
def scrape_glassdoor_reviews_with_session(company_url: str, cookies_path: str) -> list:
    """
    Scrape Glassdoor reviews using saved session cookies.

    company_url: company overview URL (trailing slash optional).
    cookies_path: path to a JSON export of logged-in Glassdoor cookies
        (e.g. from the Cookie Editor browser extension).

    Returns a list of dicts with title/rating/pros/cons/date/position.
    """
    with open(cookies_path) as f:
        cookies_data = json.load(f)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800}
            )
            # Register the stealth patch BEFORE creating the page so it is
            # guaranteed to run on the very first navigation.
            context.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
""")
            # Load cookies (Playwright needs name/value/domain/path).
            playwright_cookies = [
                {'name': c['name'], 'value': c['value'], 'domain': '.glassdoor.com', 'path': '/'}
                for c in cookies_data
            ]
            context.add_cookies(playwright_cookies)
            page = context.new_page()
            # Normalize the join so a missing trailing slash on company_url
            # doesn't produce ".../OverviewReviews/".
            page.goto(company_url.rstrip('/') + "/Reviews/", wait_until="networkidle")
            time.sleep(2)

            reviews = []
            # Extract reviews from the page; a single malformed review card
            # should not abort the whole scrape.
            for review in page.locator('[data-test="review"]').all():
                try:
                    reviews.append({
                        'title': review.locator('[data-test="review-title"]').text_content() or '',
                        'rating': review.locator('[data-test="review-rating"]').get_attribute('aria-label') or '',
                        'pros': review.locator('[data-test="review-pros"]').text_content() or '',
                        'cons': review.locator('[data-test="review-cons"]').text_content() or '',
                        'date': review.locator('[data-test="review-date"]').text_content() or '',
                        'position': review.locator('[data-test="reviewer-role"]').text_content() or '',
                    })
                except Exception:
                    continue
            return reviews
        finally:
            # Always release the browser, even if extraction raises.
            browser.close()
Method 4: Scraping Salary Data
Glassdoor's salary data is particularly valuable for compensation benchmarking:
from curl_cffi import requests as cf_requests
from bs4 import BeautifulSoup
import json, re
def get_salary_data(job_title: str, location: str = "United States") -> list:
    """
    Extract salary data from Glassdoor's salary search pages.

    job_title: e.g. "Software Engineer"
    location: e.g. "San Francisco, CA"; defaults to "United States"

    Returns a list of dicts: structured entries (median plus 10th/90th
    percentile pay) where the page embeds them, followed by raw visible
    salary-range strings. Empty list on a non-200 response.
    """
    import time

    session = cf_requests.Session()
    # First: warm up on the homepage so follow-up requests carry cookies.
    session.get("https://www.glassdoor.com/", impersonate="chrome124")
    time.sleep(2)

    # Glassdoor encodes location/keyword character offsets into the URL.
    slug = job_title.lower().replace(' ', '-')
    location_slug = location.lower().replace(' ', '-').replace(',', '')
    url = (
        f"https://www.glassdoor.com/Salaries/{location_slug}-{slug}-salary-"
        f"SRCH_IL.0,{len(location_slug)}_IN1_KO{len(location_slug)+1},"
        f"{len(location_slug)+1+len(job_title)}.htm"
    )
    headers = {
        "Referer": "https://www.glassdoor.com/Salaries/",
        "Accept-Language": "en-US,en;q=0.9",
    }
    r = session.get(url, impersonate="chrome124", headers=headers)
    if r.status_code != 200:
        return []

    soup = BeautifulSoup(r.text, 'html.parser')
    salaries = []

    # Structured pay data is embedded as JSON in inline scripts.
    for script in soup.find_all('script'):
        text = script.string or ''
        if 'totalPayMedian' not in text:
            continue
        # Match the innermost brace-delimited object holding the key.
        # A greedy `\{.*totalPayMedian.*\}` spans first-to-last brace of
        # the whole script and virtually never parses as JSON.
        match = re.search(r'\{[^{}]*totalPayMedian[^{}]*\}', text)
        if not match:
            continue
        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError:
            continue
        salaries.append({
            'job_title': job_title,
            'location': location,
            'median_pay': data.get('totalPayMedian', ''),
            'pay_range': {
                'low': data.get('totalPayPercentile10', ''),
                'high': data.get('totalPayPercentile90', ''),
            }
        })

    # Also parse visible HTML salary ranges as a fallback.
    for elem in soup.select('[data-test="salary-range"]'):
        salaries.append({'raw': elem.text.strip()})
    return salaries
# Example: fetch salary entries for one title/location pair.
salaries = get_salary_data("Software Engineer", "San Francisco, CA")
for s in salaries:
    print(json.dumps(s, indent=2))
Handling Glassdoor's Anti-Bot
Glassdoor uses several detection layers:
- TLS fingerprinting — fixed by curl_cffi with `impersonate="chrome124"`
- Cookie-based rate limiting — rotate cookies or use a fresh session
- Bot score from JS signals — use full Playwright for heavy scraping
- Login wall — requires a valid session for full data
import time, random
def glassdoor_delay():
    """Sleep a random, human-like interval between Glassdoor page loads."""
    # Glassdoor is sensitive to rapid-fire requests; the recommended
    # pacing is 5-15 seconds between page loads.
    pause = random.uniform(5, 15)
    time.sleep(pause)
def glassdoor_session():
    """Return a curl_cffi session pre-warmed with a human-looking visit pattern."""
    from curl_cffi import requests as cf_requests

    sess = cf_requests.Session()
    # Required warm-up sequence: hit the homepage first...
    sess.get("https://www.glassdoor.com/", impersonate="chrome124")
    glassdoor_delay()
    # ...then a category page, with a Referer matching the previous step,
    # before any target URL is requested.
    sess.get(
        "https://www.glassdoor.com/Reviews/index.htm",
        impersonate="chrome124",
        headers={"Referer": "https://www.glassdoor.com/"},
    )
    glassdoor_delay()
    return sess
Rate Limits and Scale
Practical limits for Glassdoor:
- Without login: 10-20 pages per IP per day (conservative estimate)
- With session cookies: 50-100 pages per session before challenge
- Commercial use: Use Glassdoor's official API (requires partnership) or data providers
For competitive salary benchmarking at scale, consider:
- Built-in datasets: Levels.fyi, Blind (professional network), LinkedIn Salary
- API providers: People Data Labs, Diffbot (expensive but reliable)
- Manual benchmarking: Glassdoor + LinkedIn + Indeed + AngelList for triangulation
Related Articles
- Web Scraping Without Getting Banned in 2026 — Anti-detection playbook
- How to Scrape LinkedIn Job Data at Scale — Related career data source
- curl_cffi Stopped Working? Here's What to Try Next — TLS fingerprint debugging
Save hours on scraping setup: The $29 Apify Scrapers Bundle includes 35+ production-ready actors — Google SERP, LinkedIn, Amazon, TikTok, contact info, and more. Pre-configured inputs, working on day one.
Related Tools
Pre-built solutions for this use case:
Top comments (0)