DEV Community

agenthustler
agenthustler

Posted on

How to Scrape Behind Login Walls: Session Management in Python

Many valuable datasets live behind login walls — job boards, business directories, analytics dashboards, and member-only content. Scraping authenticated pages requires managing sessions, cookies, and tokens properly.

In this guide, I'll show you how to handle authentication for web scraping in Python, ethically and effectively.

Important: Legal and Ethical Considerations

Before scraping behind login walls, ensure you:

  • Have a legitimate account — never use stolen credentials
  • Have the right to access the data — check the platform's ToS
  • Are collecting your own data or data you have authorization to access
  • Respect rate limits — authenticated sessions are easier to track

Method 1: Session-Based Authentication (Form Login)

Most websites use form-based login with session cookies:

import requests
from bs4 import BeautifulSoup

def login_with_session(login_url, username, password):
    """Log in through an HTML form and return an authenticated requests.Session.

    Fetches the login page first so a CSRF token (if the form embeds one)
    can be echoed back with the credentials. Returns None when the login
    appears to fail.
    """
    sess = requests.Session()

    # Step 1: fetch the form page; many sites embed a CSRF token we must echo back.
    form_page = sess.get(login_url)
    parsed = BeautifulSoup(form_page.text, "html.parser")

    # Step 2: build the POST payload, including the CSRF token when present.
    payload = {
        "username": username,
        "password": password,
    }
    token_field = parsed.select_one("input[name='csrf_token']")
    if token_field:
        payload["csrf_token"] = token_field["value"]

    # Step 3: submit the credentials.
    result = sess.post(login_url, data=payload)

    # Step 4: heuristic success check — HTTP 200 plus a redirect to the dashboard.
    if result.status_code == 200 and "dashboard" in result.url:
        print("Login successful!")
        return sess
    print("Login failed")
    return None

# Usage
# NOTE(review): placeholder credentials shown for brevity — real code should
# load them from environment variables, never hardcode them.
session = login_with_session(
    "https://example.com/login",
    "your_username",
    "your_password"
)

if session:
    # Now use the authenticated session for all requests
    # (the session carries the login cookies automatically).
    protected_page = session.get("https://example.com/dashboard/data")
    print(protected_page.text[:500])

Method 2: API Token Authentication

Many modern apps use JWT or API tokens:

import requests

def login_with_api_token(api_url, email, password):
    """Authenticate against a JSON API and return a requests.Session whose
    headers carry the bearer token.

    Returns None when authentication fails.
    """
    # Step 1: exchange credentials for an access token.
    # A timeout prevents the request from hanging forever on a dead server.
    auth_response = requests.post(
        f"{api_url}/auth/login",
        json={"email": email, "password": password},
        timeout=30,
    )

    if auth_response.status_code != 200:
        print(f"Auth failed: {auth_response.status_code}")
        return None

    token = auth_response.json()["access_token"]

    # Step 2: every request made through this session is now authenticated.
    session = requests.Session()
    session.headers.update({
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    })

    return session

def refresh_token_if_needed(session, refresh_url, refresh_token):
    """Refresh the access token and update the session's Authorization header.

    On failure the session is returned unchanged (the old token stays in
    place), so callers should still be prepared for 401 responses.
    """
    response = requests.post(
        refresh_url,
        json={"refresh_token": refresh_token},
        timeout=30,  # avoid hanging indefinitely on an unresponsive server
    )
    if response.status_code == 200:
        new_token = response.json()["access_token"]
        session.headers["Authorization"] = f"Bearer {new_token}"
    return session

Method 3: Cookie-Based Authentication

Sometimes you need to extract cookies from a browser session:

import requests
import json
from pathlib import Path

def create_session_from_cookies(cookies_dict):
    """Build a requests.Session pre-loaded with the given name -> value cookies."""
    sess = requests.Session()
    for cookie_name, cookie_value in cookies_dict.items():
        sess.cookies.set(cookie_name, cookie_value)
    return sess

# Save and load cookies for reuse
def save_cookies(session, filepath="cookies.json"):
    """Serialize the session's cookies to *filepath* as a flat JSON object."""
    jar = {cookie.name: cookie.value for cookie in session.cookies}
    Path(filepath).write_text(json.dumps(jar))

def load_cookies(filepath="cookies.json"):
    """Restore a session from previously saved cookies, or None if absent."""
    cookie_file = Path(filepath)
    if not cookie_file.exists():
        return None
    saved = json.loads(cookie_file.read_text())
    return create_session_from_cookies(saved)

Method 4: Browser-Based Login with Playwright

For complex login flows (2FA, CAPTCHAs, OAuth):

from playwright.sync_api import sync_playwright
import json

def browser_login(login_url, username, password):
    """Drive a real browser through the login flow and return its cookies.

    Runs headed so a human can solve CAPTCHAs or 2FA prompts if they appear.
    Returns the cookies as plain dicts, safe to use after the browser closes.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Visible for CAPTCHA
        page = browser.new_context().new_page()

        page.goto(login_url)

        # Submit the credentials through the form.
        page.fill("input[name='username']", username)
        page.fill("input[name='password']", password)
        page.click("button[type='submit']")

        # Block until the post-login redirect lands on the dashboard.
        page.wait_for_url("**/dashboard**", timeout=30000)

        # Snapshot the cookies before tearing the browser down.
        session_cookies = page.context.cookies()

        browser.close()
        return session_cookies

def cookies_to_session(playwright_cookies):
    """Convert Playwright cookies (a list of dicts) to a requests session.

    Domain and path are preserved so each cookie is only sent to the
    URLs it is scoped to.
    """
    # BUG FIX: this snippet never imported requests at the top, so the
    # original raised NameError here. Import it locally to stay self-contained.
    import requests

    session = requests.Session()
    for cookie in playwright_cookies:
        session.cookies.set(
            cookie["name"],
            cookie["value"],
            domain=cookie["domain"],
            path=cookie.get("path", "/")
        )
    return session

Persisting Sessions Across Runs

Save browser state to avoid re-logging in:

from playwright.sync_api import sync_playwright
from pathlib import Path

def get_persistent_session(login_url, storage_path="auth_state.json"):
    """Return a logged-in Playwright BrowserContext, reusing saved state.

    BUG FIX: the original wrapped everything in ``with sync_playwright()``,
    which stops Playwright the moment the function returns — the returned
    context was already dead. We start Playwright explicitly instead; the
    caller is responsible for closing the context (and browser) when done.
    """
    p = sync_playwright().start()

    if Path(storage_path).exists():
        # Try to reuse the saved session state first.
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(storage_state=storage_path)
        page = context.new_page()
        page.goto(login_url.replace("/login", "/dashboard"))

        if "login" not in page.url:  # Session still valid
            return context

        # Stale session: close this browser before launching a fresh one
        # (the original leaked it).
        browser.close()

    # Fresh login needed — headed, in case a CAPTCHA or 2FA prompt appears.
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto(login_url)

    # Perform login...
    # TODO(review): load real credentials from environment variables.
    page.fill("#username", "your_user")
    page.fill("#password", "your_pass")
    page.click("#login-btn")
    page.wait_for_url("**/dashboard**")

    # Persist cookies/localStorage so the next run can skip the login.
    context.storage_state(path=storage_path)
    return context

Handling Session Expiration

import time

class AuthenticatedScraper:
    """Wraps login_with_session with automatic session refresh.

    Re-authenticates when the session is older than ``session_lifetime``
    seconds or when a request comes back unauthorized.
    """

    def __init__(self, login_url, credentials):
        self.login_url = login_url
        self.credentials = credentials  # expects "username" and "password" keys
        self.session = None
        self.login_time = 0
        self.session_lifetime = 3600  # Re-login every hour

    def ensure_logged_in(self):
        """Return a live session, logging in (again) when needed.

        Raises:
            RuntimeError: if authentication fails. (The original silently
            returned None here, which made ``get`` crash later with an
            opaque AttributeError.)
        """
        expired = time.time() - self.login_time > self.session_lifetime
        if not self.session or expired:
            self.session = login_with_session(
                self.login_url,
                self.credentials["username"],
                self.credentials["password"]
            )
            if self.session is None:
                raise RuntimeError(f"Authentication failed for {self.login_url}")
            # Only stamp the login time on a successful login.
            self.login_time = time.time()
        return self.session

    def get(self, url):
        """GET *url*, transparently re-authenticating once on session expiry."""
        session = self.ensure_logged_in()
        response = session.get(url)

        # A 401 or a redirect back to the login page means the session
        # expired mid-scrape: force one re-login and retry the request.
        if response.status_code == 401 or "login" in response.url:
            self.session = None  # Force re-login
            session = self.ensure_logged_in()
            response = session.get(url)

        return response

# Usage
# NOTE(review): placeholder credentials — real code should read them from
# environment variables rather than inline literals.
scraper = AuthenticatedScraper(
    "https://example.com/login",
    {"username": "user", "password": "pass"}
)

data = scraper.get("https://example.com/api/protected-data")
print(data.json())

Best Practices

  1. Reuse sessions — don't login for every request
  2. Save cookies to disk — persist sessions across script runs
  3. Handle expiration gracefully — detect 401s and re-authenticate
  4. Use environment variables for credentials — never hardcode them
  5. Rate limit authenticated requests — sites track logged-in users more closely
  6. Log out when done — clean up your sessions

Scaling Authenticated Scraping

For large-scale authenticated scraping, you'll need reliable proxy rotation to prevent your sessions from being flagged. ThorData provides sticky residential proxies that maintain consistent IP addresses throughout your session, preventing authentication disruptions.

Conclusion

Authenticated scraping adds complexity but opens up access to valuable datasets. Start with form-based login for simple sites, use API tokens for modern apps, and fall back to Playwright for complex auth flows. Always persist your sessions and handle expiration gracefully.

Happy scraping!

Top comments (0)