News aggregators surface heavily redundant content — the same story often arrives from dozens of outlets, with duplication rates around 80%. A deduplication engine clusters those duplicates so the feed shows each story once.
MinHash + LSH
import hashlib, re
from collections import defaultdict
class MinHashLSH:
    """Near-duplicate text index: MinHash signatures bucketed with LSH banding.

    Documents whose word-shingle sets have high Jaccard similarity receive
    similar MinHash signatures and therefore collide in at least one band
    bucket with high probability, so ``find`` returns likely duplicates in
    roughly constant time per query instead of comparing against every doc.
    """

    _MOD = 2**32 - 1  # modulus for the affine hash family h(x) = (a*x + b) % _MOD

    def __init__(self, n=128):
        """n: number of hash functions (= signature length)."""
        self.n = n
        # Deterministic (a, b) coefficients derived from md5 so that signatures
        # and buckets are reproducible across runs/processes.  The builtin
        # hash() used previously is salted per-interpreter (PYTHONHASHSEED)
        # and would silently break any persisted index.
        self.h = [
            (self._stable(f"a_{i}") % self._MOD or 1,  # 'a' must be non-zero
             self._stable(f"b_{i}") % self._MOD)
            for i in range(n)
        ]
        self.bands = max(1, n // 4)    # split the signature into bands...
        self.rows = n // self.bands    # ...of `rows` values each
        self.bkts = defaultdict(list)  # band-bucket key -> list of doc ids

    @staticmethod
    def _stable(s):
        """Stable 64-bit hash of a string (md5 prefix), unlike builtin hash()."""
        return int.from_bytes(hashlib.md5(s.encode()).digest()[:8], "big")

    def shingle(self, text, k=3):
        """Return the set of k-word shingles of lowercased, de-punctuated text.

        Texts shorter than k words yield an empty set.
        """
        w = re.sub(r'[^a-z0-9 ]', '', text.lower()).split()
        return set(' '.join(w[i:i + k]) for i in range(len(w) - k + 1))

    def minhash(self, sh):
        """Return the MinHash signature of shingle set `sh` ([0]*n if empty)."""
        if not sh:
            return [0] * self.n
        # Hash each shingle once, not once per hash function (was O(n*|sh|) hashes).
        hs = [self._stable(s) for s in sh]
        return [min((a * h + b) % self._MOD for h in hs) for a, b in self.h]

    def _band_keys(self, sig):
        """Yield one bucket key per band of signature `sig`."""
        for i in range(self.bands):
            band = tuple(sig[i * self.rows:(i + 1) * self.rows])
            yield f"{i}_{hashlib.md5(str(band).encode()).hexdigest()}"

    def add(self, did, text):
        """Index document `did` under every band bucket; no-op for empty text."""
        sh = self.shingle(text)
        if not sh:
            return
        for key in self._band_keys(self.minhash(sh)):
            self.bkts[key].append(did)

    def find(self, text):
        """Return ids of previously added docs sharing at least one band bucket."""
        sh = self.shingle(text)
        if not sh:
            return []
        cands = set()
        for key in self._band_keys(self.minhash(sh)):
            cands.update(self.bkts.get(key, []))
        return list(cands)
Dedup Engine
class Dedup:
    """Clusters near-duplicate articles using MinHash/LSH candidate lookup."""

    def __init__(self):
        self.lsh = MinHashLSH(128)
        self.arts = {}                     # doc id -> article dict
        self.clusters = defaultdict(list)  # cluster id -> list of doc ids
        self.cid = 0                       # last cluster id handed out

    def process(self, article):
        """Assign `article` to an existing cluster or open a new one.

        Mutates the dict in place, setting "cid" (cluster id) and "dup"
        (True when it joined a pre-existing cluster), then returns it.
        """
        did = article.get("id") or hashlib.md5(article["url"].encode()).hexdigest()
        text = f"{article['title']} {article.get('desc','')}"

        # First LSH candidate that is a known, clustered article wins.
        cluster = None
        for cand in self.lsh.find(text):
            known = self.arts.get(cand)
            if known is not None and known.get("cid") is not None:
                cluster = known["cid"]
                break

        if cluster is not None:
            article["cid"] = cluster
            article["dup"] = True
            self.clusters[cluster].append(did)
        else:
            self.cid += 1
            article["cid"] = self.cid
            article["dup"] = False
            self.clusters[self.cid].append(did)

        self.arts[did] = article
        self.lsh.add(did, text)
        return article

    def feed(self):
        """Return a newest-first feed with one representative per cluster.

        Annotates each representative with "sources" = cluster size.
        """
        ranked = sorted(self.arts.values(), key=lambda a: a.get("pub", ""), reverse=True)
        shown, result = set(), []
        for art in ranked:
            cid = art.get("cid")
            if cid in shown:
                continue
            shown.add(cid)
            art["sources"] = len(self.clusters.get(cid, []))
            result.append(art)
        return result
Live Feed
import requests
def news():
    """Fetch the 50 latest Hacker News stories via the Algolia search API.

    Returns a list of minimal article dicts with id/title/url/desc/pub keys.
    Raises requests.HTTPError on a non-2xx response and requests.Timeout if
    the API stalls (the original call had no timeout and could hang forever).
    """
    resp = requests.get(
        "https://hn.algolia.com/api/v1/search?tags=story&hitsPerPage=50",
        timeout=10,  # never block the whole pipeline on a dead connection
    )
    resp.raise_for_status()  # fail loudly instead of parsing an error body
    return [
        {
            "id": h["objectID"],
            "title": h.get("title", ""),
            "url": h.get("url", ""),
            # NOTE(review): this endpoint returns no description field, so the
            # title doubles as `desc` — the shingler just sees the title twice.
            "desc": h.get("title", ""),
            "pub": h.get("created_at", ""),
        }
        for h in resp.json().get("hits", [])
    ]
# Run the live demo: pull stories, dedupe them, print the clean feed.
d = Dedup()
for article in news():
    processed = d.process(article)
    tag = "DUP" if processed.get("dup") else "NEW"
    print(f"[{tag}] {processed['title'][:60]}")

# Summary: how many raw articles collapsed into how many clusters.
print(f"\n{len(d.arts)} total -> {len(d.clusters)} unique")
for top in d.feed()[:10]:
    print(f" [{top['sources']} src] {top['title'][:60]}")
Scaling
To scale beyond a single feed, pair the engine with scraping infrastructure: ScraperAPI handles hard-to-fetch pages, ThorData provides IP diversity, and ScrapeOps adds job monitoring.
Performance
MinHash + LSH gives near-constant-time candidate lookup per article — no pairwise comparison against every stored story. Typical news deduplication rates fall in the 60–80% range.
Top comments (0)