How to Detect Content Plagiarism with Web Scraping in Python
Content theft is rampant. Your blog posts, product descriptions, and marketing copy get scraped and republished constantly. Let's build a plagiarism detector that finds copies of your content across the web.
How It Works
- Extract text fingerprints from your content
- Search for those fingerprints online
- Scrape matching pages to verify
- Score similarity and generate reports
Setup
import hashlib
import re
import time
from datetime import datetime
from difflib import SequenceMatcher

import requests
from bs4 import BeautifulSoup

PROXY_URL = "https://api.scraperapi.com"
API_KEY = "YOUR_SCRAPERAPI_KEY"
Searching across many websites requires reliable proxy rotation. ScraperAPI handles this at scale.
Text Fingerprinting
def generate_fingerprints(text, window=5):
    """Build overlapping word-shingle fingerprints for *text*.

    Lowercases the text, drops short words (<= 3 chars, mostly stopwords),
    then hashes every *window*-word phrase, advancing half a window at a
    time so consecutive shingles overlap.

    Args:
        text: Source document to fingerprint.
        window: Number of words per shingle (default 5).

    Returns:
        List of dicts with keys ``phrase`` (the shingle text), ``hash``
        (first 12 hex chars of its MD5 — used as a compact ID, not for
        security), and ``position`` (word index where the shingle starts).
    """
    words = [w for w in text.lower().split() if len(w) > 3]
    # Guard the stride: window=1 would make window // 2 == 0 and
    # range(..., step=0) raise ValueError.
    step = max(1, window // 2)
    fingerprints = []
    for i in range(0, len(words) - window + 1, step):
        phrase = " ".join(words[i:i + window])
        fp_hash = hashlib.md5(phrase.encode()).hexdigest()[:12]
        fingerprints.append({
            "phrase": phrase,
            "hash": fp_hash,
            "position": i
        })
    return fingerprints
def select_search_phrases(fingerprints, count=5):
    """Choose up to *count* phrases sampled evenly across *fingerprints*.

    Sampling across the whole document (rather than taking the first few
    shingles) makes partial copies of any section detectable.
    """
    stride = max(1, len(fingerprints) // count)
    chosen = []
    for fp in fingerprints[::stride]:
        if len(chosen) == count:
            break
        chosen.append(fp["phrase"])
    return chosen
Searching for Copies
def search_for_plagiarism(phrases):
    """Search Google (through the ScraperAPI proxy) for exact-phrase hits.

    Each phrase is quoted so Google performs an exact-match search.
    Results are deduplicated by URL (last occurrence wins, matching the
    original dict-comprehension behavior).

    Args:
        phrases: Iterable of phrase strings to search for.

    Returns:
        List of dicts: ``url``, ``title``, ``snippet``, ``matched_phrase``.
    """
    all_results = []
    for index, phrase in enumerate(phrases):
        query = f'"{phrase}"'
        params = {
            "api_key": API_KEY,
            "url": f"https://www.google.com/search?q={requests.utils.quote(query)}&num=10"
        }
        try:
            # Explicit timeout so one hung proxy request can't stall the scan.
            response = requests.get(PROXY_URL, params=params, timeout=60)
        except requests.RequestException:
            # One failed search shouldn't abort the remaining phrases.
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        # NOTE(review): Google's result markup (.g, h3, .VwiC3b) changes
        # frequently — verify these selectors still match current SERPs.
        for result in soup.select(".g"):
            link = result.select_one("a")
            title = result.select_one("h3")
            snippet = result.select_one(".VwiC3b")
            if link and link.get("href"):
                all_results.append({
                    "url": link["href"],
                    "title": title.text if title else "",
                    "snippet": snippet.text if snippet else "",
                    "matched_phrase": phrase
                })
        # Throttle between searches; no point sleeping after the last one.
        if index < len(phrases) - 1:
            time.sleep(5)
    unique = {r["url"]: r for r in all_results}
    return list(unique.values())
Verifying Matches
def verify_plagiarism(original_text, suspect_urls):
    """Fetch each suspect page and score its similarity to the original.

    Pages are fetched with JS rendering enabled (copied content is often
    injected client-side), stripped of boilerplate, then compared against
    *original_text* with :func:`calculate_similarity`.

    Args:
        original_text: The original article text.
        suspect_urls: Dicts with at least ``url`` and ``title`` keys,
            as produced by :func:`search_for_plagiarism`.

    Returns:
        Matches above the 15% similarity threshold, sorted by descending
        similarity; each has ``url``, ``title``, ``similarity`` (percent),
        ``matched_phrases`` and ``verified_at``.
    """
    verified = []
    for position, suspect in enumerate(suspect_urls):
        params = {
            "api_key": API_KEY,
            "url": suspect["url"],
            "render": "true"
        }
        try:
            response = requests.get(PROXY_URL, params=params, timeout=30)
        except requests.RequestException:
            # Narrowed from bare `except Exception` so programming errors
            # surface instead of being silently swallowed; an unreachable
            # page is simply skipped.
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove boilerplate so only article body text is compared.
        for tag in soup.select("script, style, nav, header, footer"):
            tag.decompose()
        page_text = soup.get_text(separator=" ", strip=True)
        similarity = calculate_similarity(original_text, page_text)
        # Below ~15% the overlap is mostly shared boilerplate, not copying.
        if similarity > 0.15:
            verified.append({
                "url": suspect["url"],
                "title": suspect["title"],
                "similarity": round(similarity * 100, 1),
                "matched_phrases": count_phrase_matches(original_text, page_text),
                "verified_at": datetime.now().isoformat()
            })
        # Throttle between fetches; skip the sleep after the last suspect.
        if position < len(suspect_urls) - 1:
            time.sleep(3)
    return sorted(verified, key=lambda x: x["similarity"], reverse=True)
def normalize_text(text):
    """Lowercase *text*, collapse whitespace, and strip non-alphanumerics."""
    collapsed = re.sub(r'\s+', ' ', text.lower())
    cleaned = re.sub(r'[^a-z0-9 ]', '', collapsed)
    return cleaned.strip()


def calculate_similarity(text1, text2):
    """Return a 0.0–1.0 similarity ratio between two texts.

    Both inputs are normalized and truncated to their first 5,000
    characters to keep SequenceMatcher fast on long pages.
    """
    left, right = (normalize_text(t)[:5000] for t in (text1, text2))
    return SequenceMatcher(None, left, right).ratio()
def count_phrase_matches(original, suspect):
    """Count overlapping 5-word phrases of *original* found verbatim in *suspect*.

    Comparison is case-insensitive. A text of n words has n - 4 windows;
    the original ``range(0, len(words) - 5)`` dropped the final window
    (off-by-one), undercounting by one whenever the last phrase matched.

    Args:
        original: The source text whose phrases are searched for.
        suspect: The candidate page text to search within.

    Returns:
        Number of 5-word windows from *original* present in *suspect*.
    """
    words = original.lower().split()
    # Hoisted out of the loop: don't re-lowercase the suspect per window.
    suspect_lower = suspect.lower()
    matches = 0
    for i in range(len(words) - 4):
        if " ".join(words[i:i + 5]) in suspect_lower:
            matches += 1
    return matches
Generating Reports
def plagiarism_report(original_url, verified_matches):
    """Render a markdown plagiarism report.

    Args:
        original_url: URL of the original article being protected.
        verified_matches: Output of ``verify_plagiarism`` — dicts with
            ``url``, ``similarity`` (percent) and ``matched_phrases``.

    Returns:
        The report as a markdown string, one section per match, with a
        HIGH / MEDIUM / LOW severity tag based on similarity.
    """
    lines = [
        "# Plagiarism Report\n",
        f"**Original:** {original_url}\n",
        f"**Scan Date:** {datetime.now().date()}\n",
        f"**Matches Found:** {len(verified_matches)}\n\n",
    ]
    for number, match in enumerate(verified_matches, 1):
        score = match["similarity"]
        if score > 50:
            severity = "HIGH"
        elif score > 25:
            severity = "MEDIUM"
        else:
            severity = "LOW"
        lines.append(f"## Match {number} [{severity}]\n")
        lines.append(f"- **URL:** {match['url']}\n")
        lines.append(f"- **Similarity:** {score}%\n")
        lines.append(f"- **Phrase Matches:** {match['matched_phrases']}\n\n")
    return "".join(lines)
# Full pipeline
# NOTE(review): these statements run at module level, so importing this file
# fires live network requests via ScraperAPI — consider wrapping in
# `if __name__ == "__main__":` before reusing the helpers elsewhere.
original = "Your original article text goes here..."
# 1. Fingerprint the source text into overlapping 5-word shingles.
fingerprints = generate_fingerprints(original)
# 2. Pick a handful of phrases sampled evenly across the document.
search_phrases = select_search_phrases(fingerprints)
# 3. Search Google (through the proxy) for exact-phrase hits.
suspects = search_for_plagiarism(search_phrases)
# 4. Fetch each suspect page and score its similarity to the original.
verified = verify_plagiarism(original, suspects)
# 5. Render a markdown report, highest similarity first.
report = plagiarism_report("https://yourblog.com/article", verified)
print(report)
Infrastructure
Plagiarism detection requires broad, reliable web access:
- ScraperAPI — search engine scraping with built-in CAPTCHA handling
- ThorData — residential proxies to access diverse websites without blocks
- ScrapeOps — monitor your plagiarism scanning pipeline
Conclusion
Protecting your content starts with knowing where it's been copied. This pipeline automates the detection process and gives you actionable data — URLs, similarity scores, and evidence — to pursue takedowns or adjust your content strategy.
Top comments (0)