DEV Community

agenthustler
agenthustler

Posted on

Building a Real-Time News Deduplication Engine with Python

News aggregators show 80% redundant content. A dedup engine clusters duplicate stories for a clean feed.

MinHash + LSH

import hashlib, re
from collections import defaultdict

class MinHashLSH:
    """Approximate near-duplicate index: MinHash signatures + LSH banding.

    Documents are reduced to k-word shingle sets, MinHash-compressed into
    n-integer signatures, and bucketed by signature bands so that similar
    documents collide in at least one bucket with high probability.
    """

    def __init__(self, n=128):
        self.n = n
        # Per-permutation (a, b) coefficients for the universal-hash family
        # (a*x + b) mod p. NOTE: the original used the builtin hash() here and
        # in minhash(); builtin string hashing is salted per process
        # (PYTHONHASHSEED), so signatures were not reproducible across runs.
        # _stable_hash() makes the index deterministic.
        self.h = [(self._stable_hash(f"a_{i}") % (2**31 - 1),
                   self._stable_hash(f"b_{i}") % (2**31 - 1)) for i in range(n)]
        self.bands = max(1, n // 4)   # 4 rows per band
        self.rows = n // self.bands
        self.bkts = defaultdict(list)  # bucket key -> [doc ids]

    @staticmethod
    def _stable_hash(s):
        """Deterministic 64-bit integer hash of a string, stable across processes."""
        return int.from_bytes(hashlib.md5(s.encode()).digest()[:8], "big")

    def shingle(self, text, k=3):
        """Return the set of k-word shingles of lowercased, punctuation-stripped text.

        Texts shorter than k words yield an empty set.
        """
        words = re.sub(r'[^a-z0-9 ]', '', text.lower()).split()
        return {' '.join(words[i:i + k]) for i in range(len(words) - k + 1)}

    def minhash(self, sh):
        """Return the n-integer MinHash signature of shingle set `sh`.

        An empty set maps to the all-zero signature (matches original behavior).
        """
        if not sh:
            return [0] * self.n
        # Hash each shingle once instead of once per permutation (the original
        # recomputed hash(s) n times per shingle).
        hashed = [self._stable_hash(s) for s in sh]
        return [min((a * hv + b) % (2**32 - 1) for hv in hashed) for a, b in self.h]

    def _bucket_keys(self, sig):
        """Yield the LSH bucket key for each band of signature `sig`."""
        for i in range(self.bands):
            band = tuple(sig[i * self.rows:(i + 1) * self.rows])
            yield f"{i}_{hashlib.md5(str(band).encode()).hexdigest()}"

    def add(self, did, text):
        """Index document `did` under every band bucket of its signature.

        No-op for texts too short to produce any shingle.
        """
        sh = self.shingle(text)
        if not sh:
            return
        for key in self._bucket_keys(self.minhash(sh)):
            self.bkts[key].append(did)

    def find(self, text):
        """Return ids of previously added documents sharing >= 1 bucket with `text`.

        These are LSH *candidates*: callers should verify similarity before
        treating them as true duplicates.
        """
        sh = self.shingle(text)
        if not sh:
            return []
        cands = set()
        for key in self._bucket_keys(self.minhash(sh)):
            cands.update(self.bkts.get(key, []))
        return list(cands)

Dedup Engine

class Dedup:
    """Groups near-duplicate articles into clusters via the MinHash/LSH index."""

    # Minimum exact Jaccard similarity on shingle sets for an LSH candidate to
    # be accepted as a true duplicate. LSH alone only generates candidates;
    # the original merged on any bucket collision, so a single false-positive
    # collision fused two unrelated stories into one cluster.
    SIM_THRESHOLD = 0.5

    def __init__(self):
        self.lsh = MinHashLSH(128)
        self.arts = {}                    # did -> article dict (annotated in place)
        self.shingles = {}                # did -> shingle set, kept for verification
        self.clusters = defaultdict(list)  # cid -> [did, ...]
        self.cid = 0                      # last cluster id issued

    def process(self, article):
        """Assign `article` to a cluster; mutates and returns the input dict.

        Adds keys: "cid" (cluster id) and "dup" (True when joining an
        existing cluster). Articles without an "id" fall back to an md5 of
        their "url".
        """
        did = article.get("id") or hashlib.md5(article["url"].encode()).hexdigest()
        text = f"{article['title']} {article.get('desc','')}"
        sh = self.lsh.shingle(text)
        matched = False
        # sorted() makes candidate order (and thus cluster assignment when
        # several clusters collide) deterministic; find() returns list(set).
        for cand in sorted(self.lsh.find(text)):
            other = self.shingles.get(cand)
            if other is None or cand not in self.arts:
                continue
            # Verify the LSH candidate with exact Jaccard similarity.
            union = len(sh | other)
            if union == 0 or len(sh & other) / union < self.SIM_THRESHOLD:
                continue
            c = self.arts[cand].get("cid")
            if c is not None:
                article["cid"], article["dup"] = c, True
                self.clusters[c].append(did)
                matched = True
                break
        if not matched:
            self.cid += 1
            article["cid"], article["dup"] = self.cid, False
            self.clusters[self.cid].append(did)
        self.arts[did] = article
        self.shingles[did] = sh
        self.lsh.add(did, text)
        return article

    def feed(self):
        """Return one representative article per cluster, newest "pub" first.

        Each representative is annotated in place with "sources" = cluster size.
        """
        out, seen = [], set()
        ranked = sorted(self.arts.items(), key=lambda kv: kv[1].get("pub", ""), reverse=True)
        for did, art in ranked:
            c = art.get("cid")
            if c in seen:
                continue
            seen.add(c)
            art["sources"] = len(self.clusters.get(c, []))
            out.append(art)
        return out

Live Feed

import requests


def news():
    """Fetch up to 50 recent Hacker News stories as flat article dicts.

    Returns an empty list on network, HTTP, or JSON-decoding errors instead
    of crashing the pipeline.
    """
    url = "https://hn.algolia.com/api/v1/search?tags=story&hitsPerPage=50"
    try:
        # The original call had no timeout and could hang the process forever.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        hits = resp.json().get("hits", [])
    except (requests.RequestException, ValueError):
        return []
    # .get("objectID") instead of ["objectID"]: a malformed hit no longer
    # raises KeyError; Dedup falls back to hashing the url when id is empty.
    return [{"id": h.get("objectID", ""),
             "title": h.get("title", ""),
             "url": h.get("url", ""),
             "desc": h.get("title", ""),
             "pub": h.get("created_at", "")}
            for h in hits]

# Run the pipeline: fetch live stories, cluster them, then print a deduped feed.
engine = Dedup()
for art in news():
    processed = engine.process(art)
    tag = 'DUP' if processed.get('dup') else 'NEW'
    print(f"[{tag}] {processed['title'][:60]}")
print(f"\n{len(engine.arts)} total -> {len(engine.clusters)} unique")
for entry in engine.feed()[:10]:
    print(f"  [{entry['sources']} src] {entry['title'][:60]}")

Scaling

ScraperAPI bypasses paywalls. ThorData for IP diversity. ScrapeOps for monitoring.

Performance

MinHash + LSH gives near-constant-time candidate lookup per article, independent of corpus size. Typical news dedup rate: 60-80%.

Top comments (0)