DEV Community

agenthustler
agenthustler

Posted on

Scraping Patent Citations for Prior Art Research with Python

Patent citation analysis reveals technological lineage. Automated prior art search saves thousands in legal fees.

USPTO PatentsView API (note: the legacy api.patentsview.org endpoint used below has been deprecated in favor of search.patentsview.org — verify the current endpoint before use)

import requests, time
from collections import defaultdict

class PatentScraper:
    """Thin client for the USPTO PatentsView patent query API."""

    def __init__(self):
        # One shared Session gives connection pooling across all requests.
        self.s = requests.Session()
        self.s.headers.update({"User-Agent": "PatentResearch/1.0"})

    def search(self, query, n=50):
        """Full-text search over patent abstracts via the PatentsView query API.

        Args:
            query: text whose words are matched (any-word) against abstracts.
            n: maximum number of results to request (default 50).

        Returns:
            A list of patent dicts, or [] on any HTTP, network, or parse error.
        """
        payload = {
            "q": {"_text_any": {"patent_abstract": query}},
            "f": ["patent_number", "patent_title", "patent_date", "patent_abstract",
                  "assignee_organization", "cited_patent_number", "citedby_patent_number"],
            "o": {"page": 1, "per_page": n},
            "s": [{"patent_date": "desc"}],
        }
        try:
            # Explicit timeout: requests otherwise blocks indefinitely on a hung server.
            r = self.s.post("https://api.patentsview.org/patents/query",
                            json=payload, timeout=30)
            if r.status_code != 200:
                return []
            # ValueError covers a 200 response with a non-JSON body.
            return r.json().get("patents") or []
        except (requests.RequestException, ValueError):
            # Preserve the original "empty result on failure" contract.
            return []

    def details(self, pn):
        """Fetch the full record for a single patent number.

        Args:
            pn: patent number string (e.g. "11023456").

        Returns:
            The patent dict, or None when not found or on any HTTP/network/parse error.
        """
        payload = {
            "q": {"patent_number": pn},
            "f": ["patent_number", "patent_title", "patent_date", "patent_abstract",
                  "cited_patent_number", "citedby_patent_number", "assignee_organization"],
        }
        try:
            # Explicit timeout: requests otherwise blocks indefinitely on a hung server.
            r = self.s.post("https://api.patentsview.org/patents/query",
                            json=payload, timeout=30)
            if r.status_code != 200:
                return None
            ps = r.json().get("patents") or []
        except (requests.RequestException, ValueError):
            return None
        return ps[0] if ps else None
Enter fullscreen mode Exit fullscreen mode

Citation Chain Builder

    def chain(self, pn, depth=2):
        visited = set()
        result = {"nodes":[],"edges":[]}
        def go(p, d):
            if p in visited or d>depth: return
            visited.add(p)
            det = self.details(p)
            if not det: return
            asgn = det.get("assignees",[{}])
            result["nodes"].append({"id":p,"title":det.get("patent_title",""),
                "assignee":asgn[0].get("assignee_organization","?") if asgn else "?"})
            for c in det.get("cited_patents",[])[:10]:
                cpn = c.get("cited_patent_number")
                if cpn:
                    result["edges"].append({"from":p,"to":cpn})
                    time.sleep(0.3); go(cpn, d+1)
        go(pn, 0)
        return result

    def prior_art(self, desc, before=None):
        qw = set(desc.lower().split())
        results = []
        for p in self.search(desc, 30):
            pd = p.get("patent_date","9999")
            if before and pd >= before: continue
            aw = set(p.get("patent_abstract","").lower().split())
            rel = len(qw&aw)/len(qw)*100 if qw else 0
            results.append({"number":p["patent_number"],"title":p["patent_title"],
                           "date":pd,"relevance":rel})
        return sorted(results, key=lambda x:x["relevance"], reverse=True)[:15]

# Demo: build a 1-hop citation graph for a sample patent, then rank
# prior-art candidates for a text query. Both calls hit the live
# PatentsView API, so this performs network I/O when the module runs.
s = PatentScraper()
c = s.chain("11023456", 1)
print(f"Nodes:{len(c['nodes'])} Edges:{len(c['edges'])}")
# Show only the top 5 candidates, formatted as "[relevance%] number - title".
for p in s.prior_art("machine learning image classification")[:5]:
    print(f"  [{p['relevance']:.0f}%] {p['number']} - {p['title']}")
Enter fullscreen mode Exit fullscreen mode

Scaling

For larger-scale collection, consider ScraperAPI for Google Patents, ThorData for international patent offices, and ScrapeOps for request monitoring.

Use cases

Due diligence, competitive intel, tech forecasting, licensing opportunities.

Top comments (0)