DEV Community

Vhub Systems
Vhub Systems

Posted on

How to Extract URLs in Bulk From a Site That May Be Paywalled

How to Extract URLs in Bulk From a Site That May Be Paywalled

You need a list of article URLs, product pages, or document links from a site — but the content might be behind a paywall, login, or access restriction. Here's how to get the URLs (even if not the full content) without hitting paywalls on every request.

Strategy: Sitemap First, Crawl Second

Most sites publish their URL structure in sitemaps even when the content is paywalled. This is free — no login needed:

import requests
from xml.etree import ElementTree
import urllib.parse

def get_sitemap_urls(domain):
    """Collect page URLs from a site's XML sitemap(s).

    Tries a list of common sitemap locations in order and returns the
    URLs found at the first location that yields any. Handles both
    sitemap index files (which point at child sitemaps) and plain
    URL-set sitemaps.

    Args:
        domain: Bare domain such as "example.com" (no scheme).

    Returns:
        List of URL strings; empty if no usable sitemap was found.
    """
    urls = []

    # Common sitemap locations, checked in order of likelihood.
    sitemap_paths = [
        "/sitemap.xml",
        "/sitemap_index.xml",
        "/sitemaps/sitemap.xml",
        "/news-sitemap.xml",
    ]

    headers = {"User-Agent": "Mozilla/5.0 (compatible; SitemapBot/1.0)"}
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

    for path in sitemap_paths:
        url = f"https://{domain}{path}"
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # A network failure on one candidate path shouldn't abort
            # the search — move on to the next location.
            continue

        if r.status_code != 200:
            continue

        try:
            root = ElementTree.fromstring(r.content)
        except ElementTree.ParseError:
            # Some sites serve an HTML error page with a 200 status;
            # skip anything that isn't valid XML.
            continue

        # Sitemap index: recurse into each child sitemap.
        for sitemap in root.findall(".//sm:sitemap/sm:loc", ns):
            urls.extend(get_sitemap_urls_from(sitemap.text, headers))

        # Plain URL set: collect <loc> entries directly.
        for loc in root.findall(".//sm:url/sm:loc", ns):
            urls.append(loc.text)

        if urls:
            break

    return urls

def get_sitemap_urls_from(sitemap_url, headers):
    """Parse one sitemap XML file and return its <loc> URLs.

    Returns [] on HTTP error, network failure, or invalid XML so that
    a single bad child sitemap can't break a sitemap-index walk.
    """
    try:
        r = requests.get(sitemap_url, headers=headers, timeout=10)
    except requests.RequestException:
        return []
    if r.status_code != 200:
        return []

    try:
        root = ElementTree.fromstring(r.content)
    except ElementTree.ParseError:
        # Not valid XML (e.g. an HTML error page served with 200).
        return []

    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    return [loc.text for loc in root.findall(".//sm:url/sm:loc", ns)]

# Usage
domain = "example.com"
all_urls = get_sitemap_urls(domain)
print(f"Found {len(all_urls)} URLs")
for url in all_urls[:10]:
    print(url)
Enter fullscreen mode Exit fullscreen mode

Find Sitemaps via robots.txt

The standard place to find sitemap locations:

import requests, re

def find_sitemaps_from_robots(domain):
    """Return sitemap URLs declared in a site's robots.txt.

    The "Sitemap:" field name is case-insensitive by convention
    (sites publish "sitemap:", "SITEMAP:", etc.), so match it
    case-insensitively.

    Returns:
        List of sitemap URL strings; empty on any failure.
    """
    try:
        r = requests.get(f"https://{domain}/robots.txt", timeout=10,
                         headers={"User-Agent": "Mozilla/5.0"})
    except requests.RequestException:
        # Unreachable host / TLS failure — treat as "no sitemaps found".
        return []
    if r.status_code != 200:
        return []

    # Extract "Sitemap:" lines anywhere in the file, any letter case.
    sitemaps = re.findall(r'^sitemap:\s*(.+)$', r.text,
                          re.MULTILINE | re.IGNORECASE)
    return [s.strip() for s in sitemaps]

sitemaps = find_sitemaps_from_robots("nytimes.com")
print("Sitemaps:", sitemaps)
Enter fullscreen mode Exit fullscreen mode

Crawl Links Without Fetching Paywalled Content

If no sitemap exists, crawl just the surface-level links (navigation, category pages) which are usually freely accessible even on paywalled sites:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
import time, re

def crawl_links(start_url, max_pages=50, delay=1.5):
    """Breadth-first crawl that collects same-domain URLs.

    Records every same-domain URL it sees, but only enqueues pages that
    look like navigation/category pages (no date pattern in the path),
    so paywalled article bodies are mostly never fetched.

    Args:
        start_url: Fully-qualified URL to start crawling from.
        max_pages: Maximum number of pages to fetch.
        delay: Seconds to sleep between requests (politeness).

    Returns:
        List of unique URLs discovered (query strings/fragments stripped).
    """
    domain = urlparse(start_url).netloc
    visited = set()
    found_urls = set()
    queue = deque([start_url])

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    }

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue

        # Mark visited BEFORE fetching so a failed request still counts
        # toward max_pages and is never retried if re-queued. (The
        # original only marked success, so a run of errors could loop
        # without ever advancing len(visited).)
        visited.add(url)

        try:
            r = requests.get(url, headers=headers, timeout=10)

            # Paywall/login responses: keep the URL, skip the content.
            if r.status_code in (401, 402, 403):
                found_urls.add(url)
                continue

            if r.status_code != 200:
                continue

            soup = BeautifulSoup(r.content, 'html.parser')

            for a in soup.find_all('a', href=True):
                href = urljoin(url, a['href'])
                parsed = urlparse(href)

                # Same-domain http(s) links only.
                if parsed.netloc != domain:
                    continue
                if parsed.scheme not in ('http', 'https'):
                    continue

                # Normalize: drop query string and fragment.
                clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                found_urls.add(clean_url)

                # Enqueue only likely category/section pages; a date
                # pattern in the path usually marks a single article.
                if not re.search(r'\d{4}[-/]\d{2}[-/]\d{2}', parsed.path):
                    if clean_url not in visited:
                        queue.append(clean_url)

            time.sleep(delay)

        except Exception as e:
            # Best-effort crawl: one bad page shouldn't stop the rest.
            print(f"Error on {url}: {e}")

    return list(found_urls)

urls = crawl_links("https://example.com", max_pages=20)
print(f"Found {len(urls)} unique URLs")
Enter fullscreen mode Exit fullscreen mode

Using Google Search to Find Site URLs (Free)

For public content indexed by Google, this is often the best approach:

import requests, time
from bs4 import BeautifulSoup

def google_site_search(domain, query="", pages=3):
    """Collect result URLs from a Google "site:" search.

    Args:
        domain: Domain to restrict results to.
        query: Optional extra search terms.
        pages: Number of result pages to fetch (10 results each).

    Returns:
        De-duplicated list of result URLs.

    NOTE(review): scraping Google result pages is fragile — markup
    changes and CAPTCHAs can silence results; treat as best-effort.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }

    all_urls = []

    for page in range(pages):
        start = page * 10
        search_q = f"site:{domain} {query}"
        google_url = f"https://www.google.com/search?q={requests.utils.quote(search_q)}&start={start}&num=10"

        # timeout added: every other fetch in this file bounds its wait;
        # without it a stalled response hangs the whole loop.
        r = requests.get(google_url, headers=headers, timeout=10)
        if r.status_code != 200:
            break

        soup = BeautifulSoup(r.text, 'html.parser')

        for result in soup.select('div.g a[href]'):
            href = result.get('href', '')
            # Keep absolute links that mention the target domain;
            # relative hrefs are Google-internal navigation.
            if domain in href and not href.startswith('/'):
                all_urls.append(href)

        time.sleep(2)  # Respect rate limits

    return list(set(all_urls))

# Find all articles about "python" on a site
urls = google_site_search("techsite.com", query="python tutorial", pages=5)
print(f"Found {len(urls)} URLs via Google")
Enter fullscreen mode Exit fullscreen mode

Check for URL Patterns via RSS/Atom Feeds

Many paywalled sites expose article lists via RSS — the feed body may be truncated, but each entry still carries the article's URL:

import requests, feedparser

def get_rss_urls(domain):
    """Return article URLs from the first RSS/Atom feed found.

    Probes common feed paths in order and returns entry links from the
    first feed that has any entries; [] if none respond.
    """
    feed_paths = [
        "/feed", "/rss", "/feed.xml", "/rss.xml",
        "/feeds/posts/default", "/blog/feed", "/news/feed"
    ]

    for path in feed_paths:
        url = f"https://{domain}{path}"
        try:
            feed = feedparser.parse(url)
            if feed.entries:
                print(f"Found feed at: {url} ({len(feed.entries)} entries)")
                # Guard against entries with no link attribute, matching
                # how extract_all_urls handles the same data.
                return [entry.link for entry in feed.entries
                        if hasattr(entry, 'link')]
        except Exception:
            # Narrowed from a bare except, which would also swallow
            # KeyboardInterrupt/SystemExit.
            continue

    return []

# pip install feedparser
urls = get_rss_urls("techcrunch.com")
print(f"RSS gave us {len(urls)} article URLs")
Enter fullscreen mode Exit fullscreen mode

Practical: Full Script to Extract All URLs

import requests, feedparser, re
from xml.etree import ElementTree
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def extract_all_urls(domain, limit=1000):
    """Aggregate URLs for *domain* from sitemaps, feeds, and the homepage.

    Strategy (best-effort at every step; a failure in one source never
    aborts the others):
      1. robots.txt -> declared sitemaps -> <loc> entries
      2. common RSS/Atom feed paths -> entry links
      3. homepage -> same-domain anchor hrefs

    Args:
        domain: Bare domain such as "example.com".
        limit: Cap on the number of URLs returned.

    Returns:
        List of up to *limit* unique URLs that contain the domain.
    """
    results = set()
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}

    print(f"[1/4] Checking robots.txt for sitemaps...")
    try:
        r = requests.get(f"https://{domain}/robots.txt", headers=headers, timeout=10)
        sitemap_refs = re.findall(r'Sitemap:\s*(.+)', r.text, re.IGNORECASE)
        for sm_url in sitemap_refs:
            try:
                r2 = requests.get(sm_url.strip(), headers=headers, timeout=10)
                root = ElementTree.fromstring(r2.content)
                ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
                locs = [e.text for e in root.findall(".//sm:loc", ns) if e.text]
                results.update(locs)
                print(f"  Sitemap: {len(locs)} URLs from {sm_url[:60]}")
            except (requests.RequestException, ElementTree.ParseError):
                # One unreachable or malformed sitemap shouldn't stop
                # the remaining ones (narrowed from a bare except).
                pass
    except requests.RequestException:
        pass

    print(f"[2/4] Checking RSS/Atom feeds...")
    for path in ["/feed", "/rss", "/feed.xml", "/rss.xml"]:
        try:
            feed = feedparser.parse(f"https://{domain}{path}")
            if feed.entries:
                urls = [e.link for e in feed.entries if hasattr(e, 'link')]
                results.update(urls)
                print(f"  Feed {path}: {len(urls)} URLs")
                break
        except Exception:
            # feedparser rarely raises; keep best-effort but avoid a
            # bare except that would swallow KeyboardInterrupt.
            pass

    print(f"[3/4] Crawling homepage for links...")
    try:
        r = requests.get(f"https://{domain}/", headers=headers, timeout=10)
        soup = BeautifulSoup(r.content, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = urljoin(f"https://{domain}/", a['href'])
            if domain in href:
                # Strip query string and fragment before de-duplicating.
                results.add(href.split('?')[0].split('#')[0])
    except requests.RequestException:
        pass

    final = [u for u in results if domain in u][:limit]
    print(f"\nTotal unique URLs: {len(final)}")
    return final

if __name__ == "__main__":
    # Entry point: collect URLs for the target domain and persist them,
    # one per line, sorted for stable diffs between runs.
    target = "example.com"
    collected = extract_all_urls(target)

    with open("urls.txt", "w") as out:
        out.writelines(u + "\n" for u in sorted(collected))
    print(f"Saved {len(collected)} URLs to urls.txt")
Enter fullscreen mode Exit fullscreen mode

Notes on Paywalled Content

Getting URLs is just the first step. For the actual content behind a paywall:

  • Academic papers: Try Unpaywall, Semantic Scholar, or the authors' personal sites
  • News articles: Google News often has cached/preview versions; 12ft.io for some sites
  • Product/pricing data: Often available in JSON API endpoints even when HTML is paywalled
  • Research reports: Abstract and metadata usually free; check if publisher offers free access after registration

The URL extraction techniques above work regardless of whether content is paywalled — sitemaps and robots.txt are generally served without any login or paywall.


Related Reading


Take the next step

Skip the setup. Production-ready tools for bulk URL extraction:

Apify Scrapers Bundle — $29 one-time

Instant download. Documented. Ready to deploy.

Top comments (0)