DEV Community

Vhub Systems
Vhub Systems

Posted on

How to Scrape Websites That Require Login Without Using Selenium

How to Scrape Websites That Require Login Without Using Selenium

Most tutorials show you how to handle logins with Selenium — but Selenium is slow, resource-heavy, and easily detected. For 80% of login-required sites, you can authenticate much more efficiently with just Python requests.

How Login Works (What You're Actually Doing)

When you "log in" to a website, three things happen:

  1. Your browser sends credentials (username + password) to the server
  2. The server validates them and returns a session cookie (or JWT token)
  3. Your browser sends that cookie with every subsequent request — the server recognizes you as logged in

Web scraping a login-required site means replicating steps 1-3 programmatically, then using the resulting session for your scraping requests.

Method 1: Form-Based Login (Most Common)

Traditional websites use HTML forms that POST credentials:

import requests
from bs4 import BeautifulSoup

def create_session_with_login(login_url: str, username: str, password: str) -> requests.Session:
    """
    Log in to a form-based website and return an authenticated session.

    Fetches the login page first to collect hidden form fields (CSRF
    tokens, nonces), POSTs the credentials to the form's action URL,
    then applies a couple of heuristic checks for login failure.

    Args:
        login_url: Full URL of the page containing the login form.
        username: Value sent in the form's username field.
        password: Value sent in the form's password field.

    Returns:
        A requests.Session carrying the authentication cookies.

    Raises:
        ValueError: If the response still looks like the login page or
            contains a known failure message.
        requests.RequestException: On network errors, timeouts, or a
            non-2xx login page.
    """
    from urllib.parse import urljoin

    session = requests.Session()

    # Required headers to look like a real browser
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    })

    # Step 1: GET the login page to get any CSRF tokens.
    # Timeout so a hung server can't block the scraper forever.
    login_page = session.get(login_url, timeout=15)
    login_page.raise_for_status()
    soup = BeautifulSoup(login_page.text, 'html.parser')

    # Step 2: Find hidden fields (CSRF tokens, nonces, etc.)
    form = soup.find('form')
    hidden_fields = {}

    if form:
        for hidden in form.find_all('input', type='hidden'):
            name = hidden.get('name')
            if name:
                hidden_fields[name] = hidden.get('value', '')

    # Step 3: Build login payload (add credentials to hidden fields)
    payload = {
        **hidden_fields,          # Include all hidden fields (CSRF, etc.)
        'username': username,     # Find actual field names by inspecting the form
        'password': password,
    }

    # Step 4: POST credentials to the form's action URL.
    # urljoin resolves absolute ('https://...'), root-relative ('/login')
    # AND page-relative ('do_login') actions — the manual scheme+netloc
    # rebuild only handled the root-relative case.
    action = form.get('action') if form else None
    action_url = urljoin(login_url, action) if action else login_url

    response = session.post(
        action_url,
        data=payload,
        headers={"Referer": login_url},
        allow_redirects=True,
        timeout=15,
    )

    # Step 5: Verify login succeeded.
    # Heuristic: still being on a login/password URL, or a known error
    # phrase in the body, indicates failure.
    if 'password' in response.url or 'login' in response.url:
        raise ValueError(f"Login failed - still on login page: {response.url}")
    if any(phrase in response.text.lower() for phrase in ['invalid password', 'login failed', 'incorrect credentials']):
        raise ValueError("Login failed - error message found in response")

    print(f"Logged in successfully. Session cookies: {dict(session.cookies)}")
    return session

# Usage: authenticate once, then reuse the session for every request.
auth_session = create_session_with_login(
    login_url="https://target-site.com/login",
    username="your@email.com",
    password="yourpassword"
)

# The session now carries the login cookies on each request.
data = auth_session.get("https://target-site.com/dashboard")
print(data.status_code)
Enter fullscreen mode Exit fullscreen mode

Finding the Right Form Field Names

This is the step most tutorials skip. You need to inspect the actual form:

from bs4 import BeautifulSoup
import requests

# Step 1: Get login page.
# Timeout so a slow or hung server fails fast instead of blocking forever.
r = requests.get("https://target-site.com/login", timeout=15)
soup = BeautifulSoup(r.text, 'html.parser')

# Step 2: Dump every form's action, method, and input fields so you can
# see the exact field names the login POST must use.
for form in soup.find_all('form'):
    print(f"\nForm action: {form.get('action')}")
    print(f"Form method: {form.get('method', 'GET')}")
    print("Fields:")
    for inp in form.find_all('input'):
        print(f"  name={inp.get('name')!r} type={inp.get('type')!r} value={inp.get('value','')[:20]!r}")
Enter fullscreen mode Exit fullscreen mode

Run this to see exactly what fields the form uses. Common patterns:

  • username or email or user or login
  • password or passwd or pass
  • csrfmiddlewaretoken (Django) or _token (Laravel) or authenticity_token (Rails)

Method 2: API/JSON Login

Many modern sites use JSON APIs for authentication:

import requests

def login_json_api(api_base: str, username: str, password: str) -> requests.Session:
    """
    Authenticate via JSON API (REST or GraphQL).

    Tries a list of common login endpoints until one returns 200/201,
    then stores the bearer token as an Authorization header (or falls
    back to whatever cookies the server set).

    Args:
        api_base: Site root, e.g. "https://example.com" (no trailing slash).
        username: Account username or email.
        password: Account password.

    Returns:
        A requests.Session with Authorization header or cookies set.

    Raises:
        ValueError: If no endpoint accepted the credentials.
    """
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Content-Type": "application/json",
        "Accept": "application/json",
    })

    # Try common API login endpoints
    login_endpoints = [
        "/api/auth/login",
        "/api/v1/auth/login",
        "/api/login",
        "/auth/login",
        "/api/users/login",
        "/api/sessions",
    ]

    payload = {"username": username, "password": password}
    # Or sometimes: {"email": username, "password": password}

    for endpoint in login_endpoints:
        try:
            r = session.post(f"{api_base}{endpoint}", json=payload, timeout=10)
            if r.status_code not in (200, 201):
                continue

            data = r.json()
            # Guard: some APIs return a JSON array or scalar; the token
            # lookups below require an object.
            if not isinstance(data, dict):
                continue

            # Extract token from response — location varies by API.
            token = (
                data.get('token') or
                data.get('access_token') or
                data.get('data', {}).get('token') or
                data.get('user', {}).get('token') or
                ''
            )

            if token:
                # Set authorization header for all future requests
                session.headers['Authorization'] = f"Bearer {token}"
                print(f"Logged in via {endpoint}, token: {token[:20]}...")
                return session

            if session.cookies:
                # No token in the body — the server is using cookie auth.
                print(f"Logged in via {endpoint}, using session cookies")
                return session
        except (requests.RequestException, ValueError, AttributeError):
            # Network/timeout error, non-JSON body, or unexpected JSON
            # shape: try the next endpoint. A bare `except:` here would
            # also swallow KeyboardInterrupt/SystemExit.
            continue

    raise ValueError("Could not find working API login endpoint")

# Usage: log in through the JSON API, then call protected endpoints.
api_session = login_json_api("https://target-site.com", "user@email.com", "password123")
data = api_session.get("https://target-site.com/api/v1/protected-data")
print(data.json())
Enter fullscreen mode Exit fullscreen mode

Method 3: Using Saved Browser Cookies

If you can log in via your browser once and export the cookies, you can keep reusing them until they expire (typically days to weeks later):

import requests
import json

def session_from_cookies(cookies_path: str, base_url: str) -> requests.Session:
    """
    Create an authenticated session from browser cookies (exported via a
    Cookie Editor-style extension as a JSON list of cookie objects).
    Cookies usually last 7-30 days before expiring.

    Args:
        cookies_path: Path to the exported cookies JSON file.
        base_url: Site root used to verify the session, e.g. "https://site.com".

    Returns:
        A requests.Session preloaded with the exported cookies.

    Raises:
        ValueError: If the verification request redirects (cookies
            expired or invalid).
    """
    # Explicit encoding: exported JSON is UTF-8 regardless of locale.
    with open(cookies_path, encoding='utf-8') as f:
        cookies_data = json.load(f)

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    })

    # Load all cookies into session
    for cookie in cookies_data:
        session.cookies.set(
            cookie['name'],
            cookie['value'],
            # Browser exports prefix domains with '.'; strip it.
            domain=cookie.get('domain', '').lstrip('.'),
            path=cookie.get('path', '/')
        )

    # Verify the session: an authenticated dashboard should NOT redirect.
    r = session.get(f"{base_url}/dashboard", allow_redirects=False, timeout=15)

    # Any 3xx (301/302/303/307/308 — not just 302) means we bounced,
    # almost certainly to the login page.
    if 300 <= r.status_code < 400:
        raise ValueError("Cookies expired or invalid - please export fresh cookies")

    print(f"Session active: {r.status_code}")
    return session

# Usage: build a session straight from cookies exported by the browser.
cookie_session = session_from_cookies(
    cookies_path="/path/to/exported_cookies.json",
    base_url="https://target-site.com"
)
Enter fullscreen mode Exit fullscreen mode

Handling CSRF Tokens

Many sites require a CSRF token that changes each request:

import requests
from bs4 import BeautifulSoup
import re

class CSRFSession:
    """Session wrapper that automatically handles CSRF tokens.

    Call get() first so a token can be scraped from the page; subsequent
    post() calls attach the cached token to both the form data and the
    common CSRF headers.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        })
        # Most recent token scraped by get(); None until a page is fetched.
        self._csrf_token = None

    def _extract_csrf(self, response) -> str:
        """Extract a CSRF token from a response, trying several common
        locations in order. Returns '' when no token is found."""
        soup = BeautifulSoup(response.text, 'html.parser')

        # 1) From meta tag (Django, Rails)
        meta = soup.find('meta', attrs={'name': ['csrf-token', '_token', 'csrfToken']})
        if meta:
            return meta.get('content', '')

        # 2) From hidden input field
        csrf_input = soup.find('input', {'name': ['csrfmiddlewaretoken', '_token', '_csrf', 'csrf_token']})
        if csrf_input:
            return csrf_input.get('value', '')

        # 3) From cookies (Laravel/Angular-style XSRF cookies)
        csrf_cookie = self.session.cookies.get('XSRF-TOKEN') or self.session.cookies.get('csrf_token')
        if csrf_cookie:
            return csrf_cookie

        # 4) From a JS variable embedded in the page
        match = re.search(r'csrf[_-]?token["\s]*:["\s]*(["\'])(.+?)\1', response.text, re.IGNORECASE)
        if match:
            return match.group(2)

        return ''

    def get(self, url: str, **kwargs) -> requests.Response:
        """GET a page and refresh the cached CSRF token from it."""
        r = self.session.get(url, **kwargs)
        self._csrf_token = self._extract_csrf(r)
        return r

    def post(self, url: str, data: dict = None, **kwargs) -> requests.Response:
        """POST with the cached CSRF token added to the data and headers."""
        if self._csrf_token and data is not None:
            # Build a copy instead of mutating the caller's dict — the
            # previous version modified `data` in place, a surprising
            # side effect for callers that reuse their payload dict.
            data = {
                **data,
                'csrfmiddlewaretoken': self._csrf_token,  # Django
                '_token': self._csrf_token,               # Laravel
            }

        if self._csrf_token:
            # Also add as headers; note these persist on the session for
            # subsequent requests until overwritten by the next token.
            self.session.headers['X-CSRF-Token'] = self._csrf_token
            self.session.headers['X-CSRFToken'] = self._csrf_token

        return self.session.post(url, data=data, **kwargs)
Enter fullscreen mode Exit fullscreen mode

Maintaining Long-Running Sessions

Sessions expire. Here's how to handle that gracefully:

import requests
import time
from functools import wraps

class ResilientAuthSession:
    """Wraps a login function and transparently re-authenticates when a
    response looks like an expired session (401/403, or a bounce to a
    login/sign-in URL)."""

    def __init__(self, login_func):
        # login_func: zero-argument callable returning an authenticated
        # requests.Session.
        self.login_func = login_func
        self.session = None
        self._authenticate()

    def _authenticate(self):
        """(Re)create the session and record when we last logged in."""
        self.session = self.login_func()
        self._last_auth = time.time()
        print(f"Authenticated at {time.strftime('%H:%M:%S')}")

    def _is_auth_valid(self, response: requests.Response) -> bool:
        """Check if we got redirected to login."""
        # Slice BEFORE lower(): the original lower-cased the entire body
        # (`text.lower()[:200]`) just to inspect its first 200 chars.
        return not (
            response.status_code in (401, 403) or
            'login' in response.url or
            'sign-in' in response.url or
            'unauthorized' in response.text[:200].lower()
        )

    def get(self, url: str, **kwargs) -> requests.Response:
        """GET url, re-authenticating once if the session looks expired."""
        r = self.session.get(url, **kwargs)

        if not self._is_auth_valid(r):
            print("Session expired, re-authenticating...")
            self._authenticate()
            # Single retry with the fresh session; if this also fails,
            # the stale-looking response is returned to the caller.
            r = self.session.get(url, **kwargs)

        return r

# Usage: supply a zero-arg login callable; the wrapper handles expiry.
def my_login():
    """Minimal login function: POST credentials, return the session."""
    s = requests.Session()
    s.post("https://site.com/login", data={"username": "u", "password": "p"})
    return s

resilient = ResilientAuthSession(my_login)
data = resilient.get("https://site.com/protected-page")
Enter fullscreen mode Exit fullscreen mode

Debugging Login Issues

When login fails:

import requests

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'

# Enable debug logging (also surfaces urllib3's request/response lines)
import logging
logging.basicConfig(level=logging.DEBUG)

# 1. Check what the login response actually returns
r = session.post("https://site.com/login", 
    data={"username": "u", "password": "p"},
    allow_redirects=False,  # Don't follow redirects
    timeout=15)             # Fail fast instead of hanging indefinitely

print(f"Status: {r.status_code}")
print(f"Location: {r.headers.get('Location', 'No redirect')}")
print(f"Cookies set: {dict(session.cookies)}")
print(f"Response body: {r.text[:500]}")
Enter fullscreen mode Exit fullscreen mode

Common fixes:

  • allow_redirects=False — see what the server actually returns without following redirects
  • Check for CSRF token requirements
  • Look for additional required headers (X-Requested-With, Origin, Referer)
  • Some sites require visiting the login page first to set initial cookies before POSTing credentials

Related Articles


Related Tools


🛒 Tools & Resources

Apify Scrapers Bundle ($29) — 30+ ready-to-use scrapers
n8n AI Automation Pack ($39) — 5 production-ready workflows

Top comments (0)