How to Scrape Websites That Require Login Without Using Selenium
Most tutorials show you how to handle logins with Selenium — but Selenium is slow, resource-heavy, and easily detected. For 80% of login-required sites, you can authenticate much more efficiently with just Python requests.
How Login Works (What You're Actually Doing)
When you "log in" to a website, three things happen:
1. Your browser sends credentials (username + password) to the server
2. The server validates them and returns a session cookie (or JWT token)
3. Your browser sends that cookie with every subsequent request — the server recognizes you as logged in
Web scraping a login-required site means replicating steps 1-3 programmatically, then using the resulting session for your scraping requests.
Method 1: Form-Based Login (Most Common)
Traditional websites use HTML forms that POST credentials:
import requests
from bs4 import BeautifulSoup
def create_session_with_login(login_url: str, username: str, password: str) -> requests.Session:
    """
    Log in to a form-based website and return an authenticated session.

    Workflow:
      1. GET the login page (collects initial cookies and any CSRF tokens).
      2. Scrape the hidden <input> fields from the first <form>.
      3. POST credentials merged with the hidden fields to the form's action URL.
      4. Heuristically verify the login succeeded.

    Args:
        login_url: Full URL of the page containing the login form.
        username: Value for the form's username/email field.
        password: Value for the form's password field.

    Returns:
        requests.Session carrying the authenticated cookies.

    Raises:
        ValueError: if the post-login response looks like a failed login.
        requests.RequestException: on network errors.
    """
    from urllib.parse import urljoin

    session = requests.Session()
    # Browser-like headers: many sites reject the default python-requests UA.
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    })

    # Step 1: GET the login page to get any CSRF tokens.
    login_page = session.get(login_url, timeout=15)
    soup = BeautifulSoup(login_page.text, 'html.parser')

    # Step 2: Find hidden fields (CSRF tokens, nonces, etc.).
    form = soup.find('form')
    hidden_fields = {}
    if form:
        for hidden in form.find_all('input', type='hidden'):
            name = hidden.get('name')
            if name:
                hidden_fields[name] = hidden.get('value', '')

    # Step 3: Build login payload (credentials merged over the hidden fields).
    payload = {
        **hidden_fields,        # Include all hidden fields (CSRF, etc.)
        'username': username,   # Find actual field names by inspecting the form
        'password': password,
    }

    # Step 4: POST credentials to the form's action URL.
    # urljoin resolves absolute, root-relative ("/x"), and page-relative ("x")
    # actions alike; a missing or empty action falls back to the login page URL.
    raw_action = (form.get('action') or '') if form else ''
    action_url = urljoin(login_url, raw_action) if raw_action else login_url
    response = session.post(
        action_url,
        data=payload,
        headers={"Referer": login_url},  # some servers validate the Referer
        allow_redirects=True,
        timeout=15,
    )

    # Step 5: Verify login succeeded. Still being on a login/password URL, or
    # a failure phrase in the body, both indicate rejected credentials.
    if 'password' in response.url or 'login' in response.url:
        raise ValueError(f"Login failed - still on login page: {response.url}")
    if any(phrase in response.text.lower()
           for phrase in ['invalid password', 'login failed', 'incorrect credentials']):
        raise ValueError("Login failed - error message found in response")

    print(f"Logged in successfully. Session cookies: {dict(session.cookies)}")
    return session
# Usage: create one authenticated session, then reuse it for every request.
session = create_session_with_login(
    login_url="https://target-site.com/login",
    username="your@email.com",
    password="yourpassword"
)
# The session now carries the login cookies, so protected pages are reachable.
data = session.get("https://target-site.com/dashboard")
print(data.status_code)
Finding the Right Form Field Names
This is the step most tutorials skip. You need to inspect the actual form:
from bs4 import BeautifulSoup
import requests

# Step 1: Fetch the login page HTML.
r = requests.get("https://target-site.com/login")
soup = BeautifulSoup(r.text, 'html.parser')

# Step 2: Dump every form: its action/method plus each input's name, type,
# and (truncated) value -- these are the exact field names your login
# payload must use.
for form in soup.find_all('form'):
    print(f"\nForm action: {form.get('action')}")
    print(f"Form method: {form.get('method', 'GET')}")
    print("Fields:")
    for inp in form.find_all('input'):
        print(f" name={inp.get('name')!r} type={inp.get('type')!r} value={inp.get('value','')[:20]!r}")
Run this to see exactly what fields the form uses. Common patterns:
- `username` or `email` or `user` or `login`
- `password` or `passwd` or `pass`
- `csrfmiddlewaretoken` (Django) or `_token` (Laravel) or `authenticity_token` (Rails)
Method 2: API/JSON Login
Many modern sites use JSON APIs for authentication:
import requests
def login_json_api(api_base: str, username: str, password: str) -> requests.Session:
    """
    Authenticate via a JSON login API and return a ready-to-use session.

    Probes a list of common REST login endpoints in order. On a 2xx
    response, a bearer token is extracted from the JSON body (several
    common layouts are checked) and installed as an Authorization header;
    if no token is present but cookies were set, the cookie-based session
    is returned instead.

    Args:
        api_base: Scheme + host, e.g. "https://target-site.com" (no trailing slash).
        username: Login identifier (some APIs expect an "email" key instead).
        password: Account password.

    Returns:
        requests.Session carrying either a Bearer token header or auth cookies.

    Raises:
        ValueError: when no probed endpoint yields a usable login.
    """
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })

    # Common API login endpoints, tried in order.
    login_endpoints = [
        "/api/auth/login",
        "/api/v1/auth/login",
        "/api/login",
        "/auth/login",
        "/api/users/login",
        "/api/sessions",
    ]

    payload = {"username": username, "password": password}
    # Or sometimes: {"email": username, "password": password}

    for endpoint in login_endpoints:
        try:
            r = session.post(f"{api_base}{endpoint}", json=payload, timeout=10)
            if r.status_code not in (200, 201):
                continue
            data = r.json()
            # Extract the token from the layouts APIs commonly use.
            token = (
                data.get('token') or
                data.get('access_token') or
                data.get('data', {}).get('token') or
                data.get('user', {}).get('token') or
                ''
            )
            if token:
                # Set authorization header for all future requests.
                session.headers['Authorization'] = f"Bearer {token}"
                print(f"Logged in via {endpoint}, token: {token[:20]}...")
                return session
            if session.cookies:
                # No token in the body -- server may use cookie sessions.
                print(f"Logged in via {endpoint}, using session cookies")
                return session
        except (requests.RequestException, ValueError, AttributeError):
            # Narrowed from a bare `except:`. Network errors, JSON-decode
            # failures, and non-dict JSON bodies mean "try the next
            # endpoint"; anything else (KeyboardInterrupt, real bugs)
            # should propagate.
            continue

    raise ValueError("Could not find working API login endpoint")
# Usage: the token (or cookies) are already attached to the returned session.
session = login_json_api("https://target-site.com", "user@email.com", "password123")
data = session.get("https://target-site.com/api/v1/protected-data")
print(data.json())
Method 3: Using Saved Browser Cookies
If you can log in via your browser once and export the cookies, you can reuse them indefinitely (until they expire):
import requests
import json
def session_from_cookies(cookies_path: str, base_url: str) -> requests.Session:
    """
    Create an authenticated session from browser cookies exported to JSON
    (e.g. via a "Cookie Editor" browser extension).

    Cookies usually last 7-30 days before expiring; when the verification
    request bounces to a login page, re-export fresh cookies.

    Args:
        cookies_path: Path to the exported JSON cookie list (each entry
            needs at least "name" and "value"; "domain" and "path" are
            used when present).
        base_url: Site root, e.g. "https://target-site.com".

    Returns:
        requests.Session pre-loaded with the exported cookies.

    Raises:
        ValueError: if the cookies no longer grant access.
    """
    with open(cookies_path, encoding="utf-8") as f:
        cookies_data = json.load(f)

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    })

    # Load all cookies into the session's jar. The leading dot on exported
    # domains (".site.com") is stripped; requests handles host matching.
    for cookie in cookies_data:
        session.cookies.set(
            cookie['name'],
            cookie['value'],
            domain=cookie.get('domain', '').lstrip('.'),
            path=cookie.get('path', '/')
        )

    # Verify the session: an unauthenticated hit on a protected page is
    # normally answered with a redirect to the login page. Check for ANY
    # 3xx, not just 302 -- servers also use 301/303/307/308 here.
    r = session.get(f"{base_url}/dashboard", allow_redirects=False, timeout=15)
    if 300 <= r.status_code < 400:
        raise ValueError("Cookies expired or invalid - please export fresh cookies")

    print(f"Session active: {r.status_code}")
    return session
# Usage
session = session_from_cookies(
cookies_path="/path/to/exported_cookies.json",
base_url="https://target-site.com"
)
Handling CSRF Tokens
Many sites require a CSRF token that changes each request:
import requests
from bs4 import BeautifulSoup
import re
class CSRFSession:
    """requests.Session wrapper that automatically handles CSRF tokens.

    Every GET scans the response for a CSRF token (meta tag, hidden input,
    cookie, or inline JS variable); the token found is then attached to
    subsequent POSTs both as form fields and as headers, covering the
    conventions of Django, Laravel, Rails, and Angular-style apps.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        })
        # Most recent token scraped by get(); None until the first GET.
        self._csrf_token = None

    def _extract_csrf(self, response) -> str:
        """Extract a CSRF token from *response*, trying each known location.

        Search order: <meta> tag (Django/Rails), hidden <input>, session
        cookies (Angular/Laravel style), then an inline JS variable.
        Returns '' when nothing is found.
        """
        # From meta tag (Django, Rails)
        soup = BeautifulSoup(response.text, 'html.parser')
        meta = soup.find('meta', attrs={'name': ['csrf-token', '_token', 'csrfToken']})
        if meta:
            return meta.get('content', '')
        # From hidden input field
        csrf_input = soup.find('input', {'name': ['csrfmiddlewaretoken', '_token', '_csrf', 'csrf_token']})
        if csrf_input:
            return csrf_input.get('value', '')
        # From cookies
        csrf_cookie = self.session.cookies.get('XSRF-TOKEN') or self.session.cookies.get('csrf_token')
        if csrf_cookie:
            return csrf_cookie
        # From a JS variable embedded in the page
        match = re.search(r'csrf[_-]?token["\s]*:["\s]*(["\'])(.+?)\1', response.text, re.IGNORECASE)
        if match:
            return match.group(2)
        return ''

    def get(self, url: str, **kwargs) -> "requests.Response":
        """GET *url* and remember any CSRF token found in the response."""
        r = self.session.get(url, **kwargs)
        self._csrf_token = self._extract_csrf(r)
        return r

    def post(self, url: str, data: dict = None, **kwargs) -> "requests.Response":
        """POST *data* to *url*, injecting the last-seen CSRF token.

        The caller's dict is copied before token fields are added, so the
        argument is never mutated (the previous implementation wrote the
        token keys directly into the caller's dictionary).
        """
        if self._csrf_token and data is not None:
            data = {
                **data,
                'csrfmiddlewaretoken': self._csrf_token,  # Django
                '_token': self._csrf_token,               # Laravel
            }
        if self._csrf_token:
            # Header variants checked by Rails/Express and Django respectively.
            self.session.headers['X-CSRF-Token'] = self._csrf_token
            self.session.headers['X-CSRFToken'] = self._csrf_token
        return self.session.post(url, data=data, **kwargs)
Maintaining Long-Running Sessions
Sessions expire. Here's how to handle that gracefully:
import requests
import time
from functools import wraps
class ResilientAuthSession:
    """Session wrapper that transparently re-authenticates on expiry.

    Wraps a zero-argument *login_func* that performs a login and returns a
    requests.Session. On each GET, if the response looks unauthenticated,
    the login is re-run once and the request retried.
    """

    def __init__(self, login_func):
        # login_func: callable returning a freshly authenticated requests.Session.
        self.login_func = login_func
        self.session = None
        self._authenticate()

    def _authenticate(self):
        """Run login_func and record when authentication happened."""
        self.session = self.login_func()
        self._last_auth = time.time()
        print(f"Authenticated at {time.strftime('%H:%M:%S')}")

    def _is_auth_valid(self, response: "requests.Response") -> bool:
        """Heuristic: did *response* come back authenticated?

        Treats 401/403, a final URL containing 'login'/'sign-in', or the
        word 'unauthorized' near the top of the body as a dead session.
        """
        # Slice BEFORE lowercasing so only the first 200 chars are
        # processed (the original lowercased the entire body just to
        # inspect 200 characters).
        return not (
            response.status_code in (401, 403) or
            'login' in response.url or
            'sign-in' in response.url or
            'unauthorized' in response.text[:200].lower()
        )

    def get(self, url: str, **kwargs) -> "requests.Response":
        """GET with one automatic re-authentication retry on expiry."""
        r = self.session.get(url, **kwargs)
        if not self._is_auth_valid(r):
            print("Session expired, re-authenticating...")
            self._authenticate()
            r = self.session.get(url, **kwargs)
        return r
# Usage
def my_login():
    """Example login_func: POST credentials and return the session.

    NOTE(review): does not check whether the POST actually succeeded --
    substitute one of the login helpers above in real code.
    """
    session = requests.Session()
    session.post("https://site.com/login", data={"username": "u", "password": "p"})
    return session

resilient = ResilientAuthSession(my_login)
data = resilient.get("https://site.com/protected-page")
Debugging Login Issues
When login fails:
import requests

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'

# Enable wire-level debug logging (prints request/response details).
import logging
logging.basicConfig(level=logging.DEBUG)

# 1. Check what the login response actually returns.
# allow_redirects=False preserves the raw status and Location header
# instead of the final page reached after redirects.
r = session.post("https://site.com/login",
                 data={"username": "u", "password": "p"},
                 allow_redirects=False)  # Don't follow redirects
print(f"Status: {r.status_code}")
print(f"Location: {r.headers.get('Location', 'No redirect')}")
print(f"Cookies set: {dict(session.cookies)}")
print(f"Response body: {r.text[:500]}")
Common fixes:
- `allow_redirects=False` — see what the server actually returns without following redirects
- Check for CSRF token requirements
- Look for additional required headers (`X-Requested-With`, `Origin`, `Referer`)
- Some sites require visiting the login page first to set initial cookies before POSTing credentials
Related Articles
- Python Web Scraping Tutorial for Beginners 2026 — Foundation
- Web Scraping Without Getting Banned in 2026 — Anti-detection
- Async Web Scraping in Python — Scale your authenticated scraper
Related Tools
🛒 Tools & Resources
Apify Scrapers Bundle ($29) — 30+ ready-to-use scrapers
n8n AI Automation Pack ($39) — 5 production-ready workflows
Top comments (0)