Many valuable datasets live behind login walls — job boards, business directories, analytics dashboards, and member-only content. Scraping authenticated pages requires managing sessions, cookies, and tokens properly.
In this guide, I'll show you how to handle authentication for web scraping in Python, ethically and effectively.
Important: Legal and Ethical Considerations
Before scraping behind login walls, ensure you:
- Have a legitimate account — never use stolen credentials
- Have the right to access the data — check the platform's ToS
- Are collecting your own data or data you have authorization to access
- Respect rate limits — authenticated sessions are easier to track
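On that last point, the simplest throttle is a fixed pause between requests. A minimal sketch (the two-second default is an illustrative guess, not a universal rule; check each site's documented limits):

```python
import time

def polite_get(session, url, delay_seconds=2.0):
    """Fetch a URL, then pause so authenticated requests stay well spaced."""
    response = session.get(url)
    time.sleep(delay_seconds)  # tune per site; authenticated traffic is tracked per account
    return response
```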
Method 1: Session-Based Authentication (Form Login)
Most websites use form-based login with session cookies:
```python
import requests
from bs4 import BeautifulSoup

def login_with_session(login_url, username, password):
    session = requests.Session()

    # Step 1: Get the login page (for CSRF tokens)
    login_page = session.get(login_url)
    soup = BeautifulSoup(login_page.text, "html.parser")

    # Step 2: Extract CSRF token if present
    csrf_token = None
    csrf_input = soup.select_one("input[name='csrf_token']")
    if csrf_input:
        csrf_token = csrf_input["value"]

    # Step 3: Submit login form
    login_data = {
        "username": username,
        "password": password,
    }
    if csrf_token:
        login_data["csrf_token"] = csrf_token
    response = session.post(login_url, data=login_data)

    # Step 4: Verify login succeeded (the "dashboard" redirect check is site-specific)
    if response.status_code == 200 and "dashboard" in response.url:
        print("Login successful!")
        return session
    else:
        print("Login failed")
        return None

# Usage
session = login_with_session(
    "https://example.com/login",
    "your_username",
    "your_password",
)
if session:
    # Now use the authenticated session for all requests
    protected_page = session.get("https://example.com/dashboard/data")
    print(protected_page.text[:500])
```
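Real login forms rarely use these exact field names. Before building the login data dict, it helps to dump every input the form actually sends. A quick sketch, assuming the login form is the first `<form>` on the page:

```python
import requests
from bs4 import BeautifulSoup

def inspect_login_form(login_url):
    """Print the input fields a login form expects (names vary per site)."""
    page = requests.get(login_url)
    soup = BeautifulSoup(page.text, "html.parser")
    form = soup.find("form")  # assumption: the login form is the first form on the page
    for field in form.find_all("input"):
        print(field.get("name"), "->", field.get("type"), repr(field.get("value")))
```

Hidden inputs that show up here (CSRF tokens, return URLs) usually need to be echoed back in your POST data.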
Method 2: API Token Authentication
Many modern apps use JWT or API tokens:
```python
import requests

def login_with_api_token(api_url, email, password):
    # Step 1: Authenticate and get token
    auth_response = requests.post(f"{api_url}/auth/login", json={
        "email": email,
        "password": password,
    })
    if auth_response.status_code != 200:
        print(f"Auth failed: {auth_response.status_code}")
        return None

    token = auth_response.json()["access_token"]

    # Step 2: Create session with token in headers
    session = requests.Session()
    session.headers.update({
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    })
    return session

def refresh_token_if_needed(session, refresh_url, refresh_token):
    """Handle token expiration."""
    response = requests.post(refresh_url, json={
        "refresh_token": refresh_token,
    })
    if response.status_code == 200:
        new_token = response.json()["access_token"]
        session.headers["Authorization"] = f"Bearer {new_token}"
    return session
```
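Tying the two together, a small wrapper can retry a request once after refreshing the token on a 401. The /auth/refresh path and the refresh token value below are placeholders; match them to the API you're actually working with:

```python
def get_with_refresh(session, url, refresh_url, refresh_token):
    """GET a URL, refreshing the access token once if the server returns 401."""
    response = session.get(url)
    if response.status_code == 401:
        session = refresh_token_if_needed(session, refresh_url, refresh_token)
        response = session.get(url)
    return response

# Hypothetical usage; endpoint paths depend on the target API
session = login_with_api_token("https://api.example.com", "you@example.com", "secret")
if session:
    data = get_with_refresh(
        session,
        "https://api.example.com/v1/records",
        "https://api.example.com/auth/refresh",
        "your_refresh_token",  # placeholder: usually returned by the login endpoint
    )
```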
Method 3: Cookie-Based Authentication
Sometimes you need to extract cookies from a browser session:
```python
import requests
import json
from pathlib import Path

def create_session_from_cookies(cookies_dict):
    """Create a requests session from exported cookies."""
    session = requests.Session()
    for name, value in cookies_dict.items():
        session.cookies.set(name, value)
    return session

# Save and load cookies for reuse
def save_cookies(session, filepath="cookies.json"):
    cookies = {c.name: c.value for c in session.cookies}
    Path(filepath).write_text(json.dumps(cookies))

def load_cookies(filepath="cookies.json"):
    if Path(filepath).exists():
        cookies = json.loads(Path(filepath).read_text())
        return create_session_from_cookies(cookies)
    return None
```
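A typical workflow ties these helpers to Method 1's form login: try saved cookies first, probe whether they still work, and only log in again when they don't. The /account probe URL is an assumption; any page that requires login will do:

```python
def get_session(login_url, username, password):
    """Reuse saved cookies when possible; otherwise log in and save fresh ones."""
    session = load_cookies()
    if session:
        check = session.get("https://example.com/account")  # any login-only page
        if check.status_code == 200 and "login" not in check.url:
            return session  # saved cookies are still valid
    session = login_with_session(login_url, username, password)  # from Method 1
    if session:
        save_cookies(session)
    return session
```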
Method 4: Browser-Based Login with Playwright
For complex login flows (2FA, CAPTCHAs, OAuth):
```python
import requests
from playwright.sync_api import sync_playwright

def browser_login(login_url, username, password):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Visible for CAPTCHA
        context = browser.new_context()
        page = context.new_page()
        page.goto(login_url)

        # Fill login form
        page.fill("input[name='username']", username)
        page.fill("input[name='password']", password)
        page.click("button[type='submit']")

        # Wait for login to complete
        page.wait_for_url("**/dashboard**", timeout=30000)

        # Extract cookies for use with requests
        cookies = context.cookies()
        browser.close()
        return cookies

def cookies_to_session(playwright_cookies):
    """Convert Playwright cookies to a requests session."""
    session = requests.Session()
    for cookie in playwright_cookies:
        session.cookies.set(
            cookie["name"],
            cookie["value"],
            domain=cookie["domain"],
            path=cookie.get("path", "/"),
        )
    return session
```
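Putting the two together: log in once through a real browser, then hand the cookies to requests for the fast part of the scrape (URLs and credentials below are placeholders):

```python
cookies = browser_login("https://example.com/login", "your_username", "your_password")
session = cookies_to_session(cookies)

# Subsequent requests are plain HTTP, much faster than driving a browser
page = session.get("https://example.com/dashboard/data")
print(page.status_code)
```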
Persisting Sessions Across Runs
Save browser state to avoid re-logging in:
```python
from playwright.sync_api import sync_playwright
from pathlib import Path

def get_persistent_session(login_url, storage_path="auth_state.json"):
    # Start Playwright manually so the returned context outlives this function;
    # a `with sync_playwright()` block would shut the browser down on return.
    p = sync_playwright().start()

    if Path(storage_path).exists():
        # Reuse saved session
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(storage_state=storage_path)
        page = context.new_page()
        page.goto(login_url.replace("/login", "/dashboard"))
        if "login" not in page.url:  # Session still valid
            return context
        browser.close()  # Saved state expired; fall through to a fresh login

    # Fresh login needed
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto(login_url)

    # Perform login...
    page.fill("#username", "your_user")
    page.fill("#password", "your_pass")
    page.click("#login-btn")
    page.wait_for_url("**/dashboard**")

    # Save session state
    context.storage_state(path=storage_path)
    return context
```
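Usage is then a single call; on later runs the saved auth_state.json skips the login entirely:

```python
context = get_persistent_session("https://example.com/login")
page = context.new_page()
page.goto("https://example.com/dashboard/data")
print(page.title())
```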
Handling Session Expiration
```python
import time

class AuthenticatedScraper:
    def __init__(self, login_url, credentials):
        self.login_url = login_url
        self.credentials = credentials
        self.session = None
        self.login_time = 0
        self.session_lifetime = 3600  # Re-login every hour

    def ensure_logged_in(self):
        if not self.session or (time.time() - self.login_time > self.session_lifetime):
            # login_with_session is the form-login helper from Method 1
            self.session = login_with_session(
                self.login_url,
                self.credentials["username"],
                self.credentials["password"],
            )
            self.login_time = time.time()
        return self.session

    def get(self, url):
        session = self.ensure_logged_in()
        response = session.get(url)

        # Check if session expired mid-scrape
        if response.status_code == 401 or "login" in response.url:
            self.session = None  # Force re-login
            session = self.ensure_logged_in()
            response = session.get(url)
        return response

# Usage
scraper = AuthenticatedScraper(
    "https://example.com/login",
    {"username": "user", "password": "pass"},
)
data = scraper.get("https://example.com/api/protected-data")
print(data.json())
```
Best Practices
- Reuse sessions — don't login for every request
- Save cookies to disk — persist sessions across script runs
- Handle expiration gracefully — detect 401s and re-authenticate
- Use environment variables for credentials — never hardcode them (see the sketch after this list)
- Rate limit authenticated requests — sites track logged-in users more closely
- Log out when done — clean up your sessions
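For the credentials point in particular, a minimal pattern is to read them from environment variables so nothing secret lands in your code or repository (SCRAPER_USER and SCRAPER_PASS are example variable names):

```python
import os

# Fail fast with a KeyError if the variables aren't set
username = os.environ["SCRAPER_USER"]
password = os.environ["SCRAPER_PASS"]

scraper = AuthenticatedScraper(
    "https://example.com/login",
    {"username": username, "password": password},
)
```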
Scaling Authenticated Scraping
For large-scale authenticated scraping, you'll need reliable proxy infrastructure — rotating IPs mid-session can get your account flagged or logged out. ThorData provides sticky residential proxies that maintain a consistent IP address throughout your session, preventing authentication disruptions.
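How you wire a proxy in depends on the provider, but with requests it usually comes down to setting session.proxies. The host, port, and credential format below are placeholders, not an actual provider endpoint:

```python
import requests

session = requests.Session()
proxy_url = "http://username:password@proxy.example.com:8000"  # placeholder endpoint
session.proxies.update({"http": proxy_url, "https": proxy_url})

# With a sticky session, every request now exits from the same IP,
# so the site sees one consistent, logged-in visitor
response = session.get("https://example.com/dashboard/data")
```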
Conclusion
Authenticated scraping adds complexity but opens up access to valuable datasets. Start with form-based login for simple sites, use API tokens for modern apps, and fall back to Playwright for complex auth flows. Always persist your sessions and handle expiration gracefully.
Happy scraping!