DEV Community

agenthustler

How to Detect Content Plagiarism with Web Scraping in Python

Content theft is rampant. Your blog posts, product descriptions, and marketing copy get scraped and republished constantly. Let's build a plagiarism detector that finds copies of your content across the web.

How It Works

  1. Extract text fingerprints from your content
  2. Search for those fingerprints online
  3. Scrape matching pages to verify
  4. Score similarity and generate reports

Setup

import requests
from bs4 import BeautifulSoup
import hashlib
import re
from difflib import SequenceMatcher
from datetime import datetime

PROXY_URL = "https://api.scraperapi.com"
API_KEY = "YOUR_SCRAPERAPI_KEY"

Searching across many websites requires reliable proxy rotation. ScraperAPI handles this at scale.

Text Fingerprinting

def generate_fingerprints(text, window=5):
    # Slide a window-word frame across the text in half-window steps,
    # so consecutive phrases overlap and copies can't dodge detection
    # by straddling a boundary.
    words = text.lower().split()
    words = [w for w in words if len(w) > 3]  # drop short filler words

    fingerprints = []
    for i in range(0, len(words) - window + 1, window // 2):
        phrase = " ".join(words[i:i + window])
        fp_hash = hashlib.md5(phrase.encode()).hexdigest()[:12]
        fingerprints.append({
            "phrase": phrase,
            "hash": fp_hash,
            "position": i
        })
    return fingerprints

def select_search_phrases(fingerprints, count=5):
    # Sample phrases evenly across the document rather than only from the top.
    step = max(1, len(fingerprints) // count)
    selected = fingerprints[::step][:count]
    return [f["phrase"] for f in selected]
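To see what the windowing produces, here is a standalone run of the same logic (the sample sentence is arbitrary; notice how the half-window step makes consecutive phrases overlap):

```python
import hashlib

# Arbitrary sample input, purely for illustration.
text = "the quick brown fox jumps over the lazy sleeping dog tonight"
words = [w for w in text.lower().split() if len(w) > 3]

window = 5
phrases = [" ".join(words[i:i + window])
           for i in range(0, len(words) - window + 1, window // 2)]
hashes = [hashlib.md5(p.encode()).hexdigest()[:12] for p in phrases]

print(phrases)
# Two overlapping 5-word phrases survive the short-word filter.
```

The hashes are handy for caching and exact-match lookups; the raw phrases are what you feed to a search engine.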

Searching for Copies

import time

def search_for_plagiarism(phrases):
    all_results = []

    for phrase in phrases:
        # Quotes force the search engine to match the exact phrase.
        query = f'"{phrase}"'
        params = {
            "api_key": API_KEY,
            "url": f"https://www.google.com/search?q={requests.utils.quote(query)}&num=10"
        }
        response = requests.get(PROXY_URL, params=params, timeout=60)
        soup = BeautifulSoup(response.text, "html.parser")

        for result in soup.select(".g"):
            link = result.select_one("a")
            title = result.select_one("h3")
            snippet = result.select_one(".VwiC3b")

            if link and link.get("href"):
                all_results.append({
                    "url": link["href"],
                    "title": title.text if title else "",
                    "snippet": snippet.text if snippet else "",
                    "matched_phrase": phrase
                })
        time.sleep(5)  # pace queries to avoid rate limiting

    # Deduplicate results that matched more than one phrase
    unique = {r["url"]: r for r in all_results}
    return list(unique.values())
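One caveat: the dedupe above keys on the raw URL, so `http://` vs `https://`, a trailing slash, or a `#fragment` would count as separate pages. A small normalization helper (illustrative, not part of the pipeline above) tightens this before deduplicating:

```python
from urllib.parse import urlsplit

def canonical_url(url):
    # Collapse scheme, host case, trailing-slash, and fragment
    # differences so the same page is not reported twice.
    parts = urlsplit(url)
    path = parts.path.rstrip("/") or "/"
    key = f"{parts.netloc.lower()}{path}"
    if parts.query:
        key += f"?{parts.query}"
    return key

print(canonical_url("https://Example.com/post/#intro"))
# → example.com/post
```

Swap `r["url"]` for `canonical_url(r["url"])` as the dict key if you adopt this.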

Verifying Matches

import time

def verify_plagiarism(original_text, suspect_urls):
    verified = []

    for suspect in suspect_urls:
        params = {
            "api_key": API_KEY,
            "url": suspect["url"],
            "render": "true"  # render JavaScript so dynamic pages load fully
        }
        try:
            response = requests.get(PROXY_URL, params=params, timeout=30)
            soup = BeautifulSoup(response.text, "html.parser")

            # Strip page chrome before comparing body text
            for tag in soup.select("script, style, nav, header, footer"):
                tag.decompose()

            page_text = soup.get_text(separator=" ", strip=True)
            similarity = calculate_similarity(original_text, page_text)

            if similarity > 0.15:
                verified.append({
                    "url": suspect["url"],
                    "title": suspect["title"],
                    "similarity": round(similarity * 100, 1),
                    "matched_phrases": count_phrase_matches(original_text, page_text),
                    "verified_at": datetime.now().isoformat()
                })
        except Exception:
            continue  # skip pages that fail to fetch or parse
        finally:
            time.sleep(3)  # throttle even when a fetch fails

    return sorted(verified, key=lambda x: x["similarity"], reverse=True)

def calculate_similarity(text1, text2):
    # Cap at 5,000 characters; SequenceMatcher gets slow on long inputs.
    t1 = normalize_text(text1)[:5000]
    t2 = normalize_text(text2)[:5000]
    return SequenceMatcher(None, t1, t2).ratio()

def normalize_text(text):
    # Lowercase, collapse whitespace, and strip punctuation
    # so formatting differences don't mask copied text.
    text = re.sub(r'\s+', ' ', text.lower())
    text = re.sub(r'[^a-z0-9 ]', '', text)
    return text.strip()

def count_phrase_matches(original, suspect):
    # Count 5-word phrases from the original that appear verbatim in the suspect.
    suspect_lower = suspect.lower()
    words = original.lower().split()
    matches = 0
    for i in range(len(words) - 4):  # include the final 5-word phrase
        phrase = " ".join(words[i:i + 5])
        if phrase in suspect_lower:
            matches += 1
    return matches
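SequenceMatcher compares characters in order, so a copier who reshuffles paragraphs can score deceptively low. If that matters for your content, an order-insensitive comparison over word shingles is a common alternative; here is a minimal sketch (not part of the pipeline above):

```python
def jaccard_shingles(text1, text2, k=5):
    # Compare the two texts as sets of k-word shingles: reshuffled
    # paragraphs still share most shingles, so copies still score high.
    def shingles(text):
        words = text.lower().split()
        return {" ".join(words[i:i + k]) for i in range(len(words) - k + 1)}

    s1, s2 = shingles(text1), shingles(text2)
    if not s1 or not s2:
        return 0.0
    return len(s1 & s2) / len(s1 | s2)  # Jaccard index

a = "one two three four five six seven eight"
b = "seven eight one two three four five six"
print(jaccard_shingles(a, b))
```

You could drop this in as a replacement for `calculate_similarity` (after running both texts through `normalize_text`), though the 0.15 threshold would need re-tuning.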

Generating Reports

def plagiarism_report(original_url, verified_matches):
    report = "# Plagiarism Report\n"
    report += f"**Original:** {original_url}\n"
    report += f"**Scan Date:** {datetime.now().date()}\n"
    report += f"**Matches Found:** {len(verified_matches)}\n\n"

    for i, match in enumerate(verified_matches, 1):
        severity = ("HIGH" if match["similarity"] > 50
                    else "MEDIUM" if match["similarity"] > 25
                    else "LOW")
        report += f"## Match {i} [{severity}]\n"
        report += f"- **URL:** {match['url']}\n"
        report += f"- **Similarity:** {match['similarity']}%\n"
        report += f"- **Phrase Matches:** {match['matched_phrases']}\n\n"

    return report

# Full pipeline
original = "Your original article text goes here..."
fingerprints = generate_fingerprints(original)
search_phrases = select_search_phrases(fingerprints)
suspects = search_for_plagiarism(search_phrases)
verified = verify_plagiarism(original, suspects)
report = plagiarism_report("https://yourblog.com/article", verified)
print(report)
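The severity buckets are easy to sanity-check in isolation; this helper mirrors the thresholds used in `plagiarism_report` (above 50% is HIGH, above 25% is MEDIUM, and the boundaries themselves fall into the lower bucket):

```python
def severity(similarity):
    # Same thresholds as plagiarism_report, extracted for testing.
    if similarity > 50:
        return "HIGH"
    if similarity > 25:
        return "MEDIUM"
    return "LOW"

print([severity(s) for s in (61.3, 30.0, 12.5)])
# → ['HIGH', 'MEDIUM', 'LOW']
```

Tune these cutoffs to your tolerance: a 30% SequenceMatcher score on a long article often means substantial copying.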

Infrastructure

Plagiarism detection requires broad, reliable web access:

  • ScraperAPI — search engine scraping with built-in CAPTCHA handling
  • ThorData — residential proxies to access diverse websites without blocks
  • ScrapeOps — monitor your plagiarism scanning pipeline

Conclusion

Protecting your content starts with knowing where it's been copied. This pipeline automates the detection process and gives you actionable data — URLs, similarity scores, and evidence — to pursue takedowns or adjust your content strategy.
