How to Extract URLs in Bulk From a Site That May Be Paywalled
You need a list of article URLs, product pages, or document links from a site — but the content might be behind a paywall, login, or access restriction. Here's how to get the URLs (even if not the full content) without hitting paywalls on every request.
Strategy: Sitemap First, Crawl Second
Most sites publish their URL structure in sitemaps even when the content is paywalled. This is free — no login needed:
import requests
from xml.etree import ElementTree
import urllib.parse
def get_sitemap_urls(domain):
    """Extract all URLs from a site's sitemap.

    Tries a list of common sitemap locations under ``https://{domain}``
    and returns the first non-empty batch of page URLs found. Sitemap
    indexes (nested sitemaps) are followed one level deep.

    Args:
        domain: Bare host name, e.g. ``"example.com"`` (no scheme).

    Returns:
        list[str]: Page URLs collected from the sitemap; empty if no
        candidate path yields a parseable sitemap.
    """
    urls = []
    # Try common sitemap locations in order of likelihood.
    sitemap_paths = [
        "/sitemap.xml",
        "/sitemap_index.xml",
        "/sitemaps/sitemap.xml",
        "/news-sitemap.xml",
    ]
    headers = {"User-Agent": "Mozilla/5.0 (compatible; SitemapBot/1.0)"}
    for path in sitemap_paths:
        url = f"https://{domain}{path}"
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # Network failure on one candidate path must not abort the scan.
            continue
        if r.status_code != 200:
            continue
        # Parse XML sitemap; some sites serve an HTML error page with a
        # 200 status, which would raise ParseError.
        try:
            root = ElementTree.fromstring(r.content)
        except ElementTree.ParseError:
            continue
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        # Handle sitemap index (nested sitemaps)
        for sitemap in root.findall(".//sm:sitemap/sm:loc", ns):
            if sitemap.text:
                child_urls = get_sitemap_urls_from(sitemap.text.strip(), headers)
                urls.extend(child_urls)
        # Handle direct URL list
        for loc in root.findall(".//sm:url/sm:loc", ns):
            if loc.text:
                urls.append(loc.text.strip())
        if urls:
            break
    return urls
def get_sitemap_urls_from(sitemap_url, headers):
    """Parse a single sitemap XML file and return its page URLs.

    Args:
        sitemap_url: Absolute URL of the sitemap XML document.
        headers: HTTP headers dict (at minimum a User-Agent).

    Returns:
        list[str]: The ``<loc>`` URLs listed in the sitemap; empty on any
        network failure, non-200 response, or XML parse error.
    """
    try:
        r = requests.get(sitemap_url, headers=headers, timeout=10)
    except requests.RequestException:
        return []
    if r.status_code != 200:
        return []
    try:
        root = ElementTree.fromstring(r.content)
    except ElementTree.ParseError:
        # Non-XML payload (e.g. an HTML error page served with 200).
        return []
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    return [loc.text for loc in root.findall(".//sm:url/sm:loc", ns) if loc.text]
# Usage example: collect sitemap URLs for a domain and preview the first ten.
target = "example.com"
collected = get_sitemap_urls(target)
print(f"Found {len(collected)} URLs")
for preview_url in collected[:10]:
    print(preview_url)
Find Sitemaps via robots.txt
The standard place to find sitemap locations:
import requests, re
def find_sitemaps_from_robots(domain):
    """Return sitemap URLs declared in ``https://{domain}/robots.txt``.

    robots.txt is the standard place for a site to advertise its sitemaps
    via ``Sitemap: <url>`` lines.

    Args:
        domain: Bare host name, e.g. ``"example.com"``.

    Returns:
        list[str]: Declared sitemap URLs; empty if robots.txt is missing,
        unreachable, or declares none.
    """
    try:
        r = requests.get(f"https://{domain}/robots.txt", timeout=10,
                         headers={"User-Agent": "Mozilla/5.0"})
    except requests.RequestException:
        return []
    if r.status_code != 200:
        return []
    # Extract Sitemap: lines — the directive name is case-insensitive
    # by robots.txt convention, so match "sitemap:" too.
    sitemaps = re.findall(r'^Sitemap:\s*(.+)$', r.text,
                          re.MULTILINE | re.IGNORECASE)
    return [s.strip() for s in sitemaps]
# Example: discover the sitemaps a large news site advertises.
declared = find_sitemaps_from_robots("nytimes.com")
print("Sitemaps:", declared)
Crawl Links Without Fetching Paywalled Content
If no sitemap exists, crawl just the surface-level links (navigation, category pages) which are usually freely accessible even on paywalled sites:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
import time, re
def crawl_links(start_url, max_pages=50, delay=1.5):
    """
    Crawl a site breadth-first, collecting URLs without fetching
    paywalled article content.

    Only category/section-looking pages are queued for further crawling;
    URLs whose path contains a date pattern (typical of individual
    articles) are recorded but not followed.

    Args:
        start_url: Fully-qualified URL to start from.
        max_pages: Maximum number of pages to request.
        delay: Seconds to sleep between request attempts (politeness).

    Returns:
        list[str]: Unique same-domain URLs discovered, with query
        strings and fragments stripped.
    """
    domain = urlparse(start_url).netloc
    visited = set()
    found_urls = set()
    queue = deque([start_url])
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    }
    # Date-like path segment (e.g. /2024/01/15/) marks an article page.
    article_date = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        # Mark visited up front so a failing URL is never retried if it
        # gets rediscovered and re-queued from another page.
        visited.add(url)
        try:
            r = requests.get(url, headers=headers, timeout=10)
            # Paywalled/denied (401/402/403): record the URL, don't parse.
            if r.status_code in (401, 402, 403):
                found_urls.add(url)
                continue
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.content, 'html.parser')
            # Collect all same-domain links on the page.
            for a in soup.find_all('a', href=True):
                href = urljoin(url, a['href'])
                parsed = urlparse(href)
                # Stay on same domain, skip external and non-http(s) schemes.
                if parsed.netloc != domain:
                    continue
                if parsed.scheme not in ('http', 'https'):
                    continue
                clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                found_urls.add(clean_url)
                # Only queue what looks like a category/section page
                # (skip date-stamped paths that look like specific articles).
                if not article_date.search(parsed.path):
                    if clean_url not in visited:
                        queue.append(clean_url)
        except Exception as e:
            # Best-effort crawler: report and keep going.
            print(f"Error on {url}: {e}")
        finally:
            # Sleep after *every* attempt — including errors and non-200
            # responses — so failing endpoints aren't hammered.
            time.sleep(delay)
    return list(found_urls)
# Demo: shallow crawl capped at 20 pages.
discovered = crawl_links("https://example.com", max_pages=20)
print(f"Found {len(discovered)} unique URLs")
Using Google Search to Find Site URLs (Free)
For public content indexed by Google, this is often the best approach — though note that Google aggressively rate-limits and may CAPTCHA automated queries, so for reliable results at scale use the official Custom Search JSON API instead:
import requests, time
from bs4 import BeautifulSoup
def google_site_search(domain, query="", pages=3):
    """Collect result URLs from a Google ``site:`` search.

    NOTE(review): scraping Google result pages is brittle — Google
    rate-limits and may CAPTCHA automated clients, so expect gaps.

    Args:
        domain: Host to restrict results to (``site:domain``).
        query: Optional extra query terms.
        pages: Number of 10-result pages to fetch.

    Returns:
        list[str]: De-duplicated result URLs pointing at the domain.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }
    all_urls = []
    for page in range(pages):
        start = page * 10
        search_q = f"site:{domain} {query}"
        google_url = f"https://www.google.com/search?q={requests.utils.quote(search_q)}&start={start}&num=10"
        try:
            # Without a timeout a stalled socket would hang the whole run.
            r = requests.get(google_url, headers=headers, timeout=10)
        except requests.RequestException:
            break
        if r.status_code != 200:
            break
        soup = BeautifulSoup(r.text, 'html.parser')
        for result in soup.select('div.g a[href]'):
            href = result.get('href', '')
            # Keep absolute links into the target domain; skip
            # Google-relative links (which start with '/').
            if domain in href and not href.startswith('/'):
                all_urls.append(href)
        time.sleep(2)  # Respect rate limits
    return list(set(all_urls))
# Example: find all "python tutorial" articles indexed for a site.
tutorial_urls = google_site_search("techsite.com", query="python tutorial", pages=5)
print(f"Found {len(tutorial_urls)} URLs via Google")
Check for URL Patterns via RSS/Atom Feeds
Many paywalled sites expose article lists via RSS — the content may be truncated, but the entry URLs are usually present:
import requests, feedparser
def get_rss_urls(domain):
    """Return article URLs from the first working RSS/Atom feed found.

    Tries a list of conventional feed paths under ``https://{domain}``.

    Args:
        domain: Bare host name, e.g. ``"techcrunch.com"``.

    Returns:
        list[str]: Entry link URLs from the first feed that has entries,
        or an empty list if no candidate path yields a feed.
    """
    feed_paths = [
        "/feed", "/rss", "/feed.xml", "/rss.xml",
        "/feeds/posts/default", "/blog/feed", "/news/feed"
    ]
    for path in feed_paths:
        url = f"https://{domain}{path}"
        try:
            feed = feedparser.parse(url)
            if feed.entries:
                print(f"Found feed at: {url} ({len(feed.entries)} entries)")
                return [entry.link for entry in feed.entries]
        except Exception:
            # feedparser swallows most errors internally; a bare `except:`
            # here would also trap KeyboardInterrupt, so catch Exception only.
            continue
    return []
# Requires: pip install feedparser
rss_urls = get_rss_urls("techcrunch.com")
print(f"RSS gave us {len(rss_urls)} article URLs")
Practical: Full Script to Extract All URLs
import requests, feedparser, re
from xml.etree import ElementTree
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def _robots_sitemap_urls(domain, headers):
    """Best-effort: collect page URLs from sitemaps listed in robots.txt."""
    urls = set()
    try:
        r = requests.get(f"https://{domain}/robots.txt", headers=headers, timeout=10)
        # The Sitemap: directive is case-insensitive per convention.
        sitemap_refs = re.findall(r'Sitemap:\s*(.+)', r.text, re.IGNORECASE)
        for sm_url in sitemap_refs:
            try:
                r2 = requests.get(sm_url.strip(), headers=headers, timeout=10)
                root = ElementTree.fromstring(r2.content)
                ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
                locs = [e.text for e in root.findall(".//sm:loc", ns) if e.text]
                urls.update(locs)
                print(f" Sitemap: {len(locs)} URLs from {sm_url[:60]}")
            except (requests.RequestException, ElementTree.ParseError):
                # One broken sitemap must not abort the rest.
                pass
    except requests.RequestException:
        pass
    return urls


def _feed_urls(domain):
    """Best-effort: collect entry URLs from the first working RSS/Atom feed."""
    urls = set()
    for path in ["/feed", "/rss", "/feed.xml", "/rss.xml"]:
        try:
            feed = feedparser.parse(f"https://{domain}{path}")
            if feed.entries:
                entry_links = [e.link for e in feed.entries if hasattr(e, 'link')]
                urls.update(entry_links)
                print(f" Feed {path}: {len(entry_links)} URLs")
                break
        except Exception:
            pass
    return urls


def _homepage_urls(domain, headers):
    """Best-effort: collect same-domain links from the homepage."""
    urls = set()
    try:
        r = requests.get(f"https://{domain}/", headers=headers, timeout=10)
        soup = BeautifulSoup(r.content, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = urljoin(f"https://{domain}/", a['href'])
            if domain in href:
                # Strip query string and fragment to de-duplicate.
                urls.add(href.split('?')[0].split('#')[0])
    except Exception:
        pass
    return urls


def extract_all_urls(domain, limit=1000):
    """Extract as many of a site's URLs as possible without logging in.

    Combines three free sources: sitemaps referenced from robots.txt,
    RSS/Atom feeds at conventional paths, and links on the homepage.
    Each source is best-effort; failures are skipped silently.

    Args:
        domain: Bare host name, e.g. ``"example.com"``.
        limit: Maximum number of URLs to return.

    Returns:
        list[str]: Up to ``limit`` unique URLs containing the domain.
    """
    results = set()
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
    # Step labels corrected from "x/4" — there are only three stages.
    print("[1/3] Checking robots.txt for sitemaps...")
    results.update(_robots_sitemap_urls(domain, headers))
    print("[2/3] Checking RSS/Atom feeds...")
    results.update(_feed_urls(domain))
    print("[3/3] Crawling homepage for links...")
    results.update(_homepage_urls(domain, headers))
    final = [u for u in results if domain in u][:limit]
    print(f"\nTotal unique URLs: {len(final)}")
    return final
if __name__ == "__main__":
    target_domain = "example.com"
    extracted = extract_all_urls(target_domain)
    # Persist the results, one URL per line, sorted for stable diffs.
    with open("urls.txt", "w") as out:
        out.writelines(u + "\n" for u in sorted(extracted))
    print(f"Saved {len(extracted)} URLs to urls.txt")
Notes on Paywalled Content
Getting URLs is just the first step. For the actual content behind a paywall:
- Academic papers: Try Unpaywall, Semantic Scholar, or the authors' personal sites
- News articles: Google News often has cached/preview versions; 12ft.io for some sites
- Product/pricing data: Often available in JSON API endpoints even when HTML is paywalled
- Research reports: Abstract and metadata usually free; check if publisher offers free access after registration
The URL extraction techniques above work regardless of whether content is paywalled — sitemaps and robots.txt are always public.
Related Reading
- Web Scraping Without Getting Banned in 2026 — Anti-bot bypass techniques once you have the URLs
- curl_cffi Stopped Working? Here's What to Try Next — TLS fingerprint issues when accessing paywalled content
Take the next step
Skip the setup. Production-ready tools for bulk URL extraction:
Apify Scrapers Bundle — $29 one-time
Instant download. Documented. Ready to deploy.
Top comments (0)