Patent data is a goldmine for competitive intelligence, research, and innovation tracking. This guide shows you how to build scrapers for the three major patent databases.
Why Scrape Patent Data?
- Track competitor R&D activity
- Identify technology trends before they hit the market
- Find prior art for patent applications
- Build innovation intelligence dashboards
USPTO: United States Patent and Trademark Office
The USPTO provides a bulk data API and a search interface:
pip install requests beautifulsoup4 lxml
Using the USPTO Open Data API
import requests, time
class USPTOScraper:
    """Client for the USPTO bulk-data publications API with polite rate limiting.

    Every request sleeps ``delay`` seconds first so repeated searches do not
    hammer the endpoint.
    """

    BASE_URL = "https://developer.uspto.gov/ibd-api/v1/application/publications"

    def __init__(self, delay=1.0):
        # Seconds to wait before each HTTP request (simple rate limit).
        self.delay = delay
        # One Session reuses the underlying connection pool across requests.
        self.session = requests.Session()

    def search_patents(self, query, start=0, rows=25):
        """Fetch a single page of search results.

        Args:
            query: Free-text search string (``searchText`` API parameter).
            start: Zero-based offset of the first record to return.
            rows: Number of records requested for this page.

        Returns:
            Parsed JSON response as a dict.

        Raises:
            requests.HTTPError: On any non-2xx response.
        """
        params = {"searchText": query, "start": start, "rows": rows}
        time.sleep(self.delay)
        response = self.session.get(self.BASE_URL, params=params)
        response.raise_for_status()
        return response.json()

    def search_all(self, query, max_results=100, page_size=25):
        """Paginate through results until ``max_results`` or exhaustion.

        BUG FIX: the original always requested full pages, so it could return
        more than ``max_results`` records (e.g. max_results=30 fetched up to
        50). The final page is now capped at the remaining budget.

        Args:
            query: Free-text search string.
            max_results: Upper bound on the number of records returned.
            page_size: Records per request (new, defaults to the old 25).

        Returns:
            List of result dicts, at most ``max_results`` long.
        """
        all_results = []
        start = 0
        while start < max_results:
            # Never request more rows than the remaining budget allows.
            rows = min(page_size, max_results - start)
            data = self.search_patents(query, start=start, rows=rows)
            results = data.get("results", [])
            if not results:
                break  # server exhausted before we hit max_results
            all_results.extend(results)
            start += len(results)
            print(f"Fetched {len(all_results)} patents...")
        return all_results
# Demo: pull the first 50 AI-related publications and preview five titles.
scraper = USPTOScraper()
ai_patents = scraper.search_all("artificial intelligence", max_results=50)
preview = ai_patents[:5]
for patent in preview:
    title = patent.get('inventionTitle', 'N/A')
    print(title)
EPO: European Patent Office
The EPO provides the Open Patent Services (OPS) API:
import requests, base64
class EPOScraper:
    """Client for the EPO Open Patent Services (OPS) REST API.

    Authenticates once with OAuth2 client credentials at construction time.
    NOTE(review): the token is never refreshed — OPS access tokens expire
    (typically after ~20 minutes), so long-running sessions should
    re-instantiate to re-authenticate; confirm against current OPS docs.
    """

    AUTH_URL = "https://ops.epo.org/3.2/auth/accesstoken"
    SEARCH_URL = "https://ops.epo.org/3.2/rest-services/published-data/search"

    def __init__(self, consumer_key, consumer_secret):
        self.token = self._authenticate(consumer_key, consumer_secret)
        self.session = requests.Session()
        # All subsequent requests carry the bearer token.
        self.session.headers["Authorization"] = f"Bearer {self.token}"

    def _authenticate(self, key, secret):
        """Exchange consumer key/secret for an OAuth2 bearer token.

        Raises:
            requests.HTTPError: If authentication fails (bad credentials,
                quota exceeded, ...).
        """
        credentials = base64.b64encode(f"{key}:{secret}".encode()).decode()
        response = requests.post(
            self.AUTH_URL,
            headers={"Authorization": f"Basic {credentials}"},
            data={"grant_type": "client_credentials"}
        )
        # BUG FIX: surface HTTP auth failures explicitly instead of letting
        # the missing "access_token" key raise an opaque KeyError below.
        response.raise_for_status()
        return response.json()["access_token"]

    def search(self, query, range_begin=1, range_end=25):
        """Search published patent data.

        Args:
            query: OPS/CQL query string (``q`` parameter).
            range_begin: 1-based index of the first record to return.
            range_end: 1-based index of the last record to return.

        Returns:
            Parsed JSON response as a dict.

        Raises:
            requests.HTTPError: On any non-2xx response.
        """
        headers = {"Accept": "application/json"}
        params = {"q": query, "Range": f"{range_begin}-{range_end}"}
        response = self.session.get(self.SEARCH_URL, headers=headers, params=params)
        # BUG FIX: fail loudly on HTTP errors (expired token, bad query)
        # rather than attempting to parse an error page as JSON.
        response.raise_for_status()
        return response.json()
Google Patents: The Aggregator
Google Patents aggregates from multiple sources. Scrape with Playwright:
from playwright.sync_api import sync_playwright
import time
def scrape_google_patents(query, max_results=20):
    """Scrape Google Patents search results with a headless browser.

    Args:
        query: Free-text search query; URL-encoded before use.
        max_results: Maximum number of result cards to parse.

    Returns:
        List of dicts with ``title``, ``patent_id`` and ``abstract`` keys
        (empty strings where an element was not found).
    """
    # Local import keeps this snippet self-contained in the article.
    from urllib.parse import quote_plus

    patents = []
    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # BUG FIX: the raw query broke the URL for multi-word or
            # special-character searches — encode it before interpolation.
            encoded = quote_plus(query)
            search_url = f"https://patents.google.com/?q={encoded}&oq={encoded}"
            page.goto(search_url, wait_until="networkidle")
            # Results render asynchronously even after network idle.
            time.sleep(3)
            results = page.query_selector_all("search-result-item")
            for result in results[:max_results]:
                title_el = result.query_selector("h3")
                id_el = result.query_selector(".result-title span")
                abstract_el = result.query_selector(".abstract")
                patents.append({
                    "title": title_el.inner_text() if title_el else "",
                    "patent_id": id_el.inner_text() if id_el else "",
                    "abstract": abstract_el.inner_text() if abstract_el else ""
                })
        finally:
            # BUG FIX: close the browser even if navigation/parsing raises,
            # so failed runs do not leak headless Chromium processes.
            browser.close()
    return patents
Building a Unified Patent Tracker
import csv
from datetime import datetime
class PatentTracker:
    """Collect patent records from multiple sources, export, and de-duplicate."""

    def __init__(self):
        # All collected patent dicts, each tagged with source + timestamp.
        self.patents = []

    def add_results(self, results, source):
        """Tag each record with its source and scrape time, then store it.

        NOTE: mutates the dicts in ``results`` in place.

        Args:
            results: Iterable of patent dicts from one scraper.
            source: Label identifying where the records came from.
        """
        for r in results:
            r["source"] = source
            r["scraped_at"] = datetime.now().isoformat()
            self.patents.append(r)

    def export_csv(self, filename="patents.csv"):
        """Write all collected patents to a CSV file.

        No-op when nothing has been collected yet.
        """
        if not self.patents:
            return
        # BUG FIX: the original used only the first record's keys, so a
        # record from another source with extra fields made DictWriter raise
        # ValueError. Use the union of keys (first-seen order) instead;
        # missing fields serialize as empty cells.
        fieldnames = []
        for p in self.patents:
            for key in p:
                if key not in fieldnames:
                    fieldnames.append(key)
        with open(filename, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.patents)
        # BUG FIX: original printed a literal "(unknown)" placeholder
        # instead of interpolating the output path.
        print(f"Exported {len(self.patents)} patents to {filename}")

    def find_duplicates(self):
        """Return {lowercased title: [sources]} for titles seen more than once.

        Useful for spotting the same patent aggregated from several databases.
        """
        titles = {}
        for p in self.patents:
            title = p.get("title", "").lower()
            titles.setdefault(title, []).append(p.get("source"))
        return {t: s for t, s in titles.items() if len(s) > 1}
Scaling with Proxies
For large-scale patent research, ScraperAPI handles rotation automatically, while ThorData offers residential IPs for sites that block datacenter ranges.
Monitoring
Use ScrapeOps to monitor your patent scrapers — track success rates across all three sources and get alerted when APIs change.
Conclusion
Patent databases are among the most structured and valuable data sources available. Combine USPTO, EPO, and Google Patents data for comprehensive coverage. Use official APIs where available, scrape where necessary, and always respect rate limits.
Top comments (0)