DEV Community

agenthustler
agenthustler

Posted on

Building an Automated Fact-Checker with Web Scraping

Misinformation is everywhere. In this guide, we'll build a Python-based fact-checker that scrapes multiple sources to verify claims automatically.

How Automated Fact-Checking Works

Our fact-checker will:

  1. Parse a claim into searchable components
  2. Search multiple authoritative sources
  3. Compare findings against the claim
  4. Return a confidence score

Setting Up

pip install requests beautifulsoup4 newspaper3k
Enter fullscreen mode Exit fullscreen mode

Step 1: Query Builder

import re
from typing import List

class QueryBuilder:
    """Turn a free-text claim into a short, prioritized list of search queries."""

    # Common function words that carry no search signal.
    STOP_WORDS = {
        "the", "a", "an", "is", "are", "was", "were", "be", "been",
        "have", "has", "had", "do", "does", "did", "will", "would",
        "could", "should", "that", "this", "what", "which", "who"
    }

    def build_queries(self, claim: str) -> List[str]:
        """Return de-duplicated search queries derived from *claim*.

        Candidates, in priority order: all extracted keywords, the first
        five keywords, and the claim itself (truncated to 80 chars) as an
        exact-phrase search. Empty and duplicate candidates are dropped —
        previously a short claim produced two identical keyword queries,
        wasting a full round of source searches.
        """
        words = re.findall(r'\b[a-zA-Z]+\b', claim.lower())
        keywords = [w for w in words if w not in self.STOP_WORDS and len(w) > 2]
        candidates = [
            " ".join(keywords),
            " ".join(keywords[:5]),
            f'"{claim[:80]}"',
        ]
        seen = set()
        queries = []
        for query in candidates:
            if query and query not in seen:
                seen.add(query)
                queries.append(query)
        return queries
Enter fullscreen mode Exit fullscreen mode

Step 2: Multi-Source Scraper

import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

class SourceScraper:
    """Search a fixed list of fact-checking sites and scrape result snippets."""

    # Each entry: display name + search URL template with a {} query slot.
    FACT_CHECK_SOURCES = [
        {"name": "Snopes", "search": "https://www.snopes.com/?s={}"},
        {"name": "PolitiFact", "search": "https://www.politifact.com/search/?q={}"},
        {"name": "FactCheck.org", "search": "https://www.factcheck.org/?s={}"},
    ]

    def __init__(self):
        # One session so connections and headers are reused across requests.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "FactChecker/1.0 (Research Tool)"
        })

    def search_source(self, source, query):
        """Return up to five {source, title, url} dicts from one site.

        The query is URL-encoded before substitution — raw spaces and
        quote characters (the exact-phrase query starts with '"')
        previously produced malformed search URLs. Any network, HTTP,
        or parse failure is logged and yields an empty list so one bad
        source never aborts the whole check (best-effort by design).
        """
        # quote_plus encodes spaces as '+' and escapes quotes/specials
        # so the query is safe to embed in a URL query string.
        url = source["search"].format(quote_plus(query))
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()  # surface HTTP 4xx/5xx as errors
            soup = BeautifulSoup(response.text, "html.parser")
            results = []
            for article in soup.find_all("article")[:5]:
                title_el = article.find(["h2", "h3", "h4"])
                link_el = article.find("a", href=True)
                if title_el:
                    results.append({
                        "source": source["name"],
                        "title": title_el.get_text(strip=True),
                        "url": link_el["href"] if link_el else "",
                    })
            return results
        except Exception as e:
            print(f"Error searching {source['name']}: {e}")
            return []

    def search_all_sources(self, query):
        """Run *query* against every configured source, politely rate-limited."""
        all_results = []
        for source in self.FACT_CHECK_SOURCES:
            results = self.search_source(source, query)
            all_results.extend(results)
            time.sleep(2)  # be polite: pause between sites
        return all_results
Enter fullscreen mode Exit fullscreen mode

Step 3: Evidence Analyzer

from collections import Counter

class EvidenceAnalyzer:
    """Score scraped fact-check headlines against verdict keyword lists."""

    # Keyword/phrase lists that signal each verdict in a result title.
    VERDICT_KEYWORDS = {
        "true": ["true", "correct", "confirmed", "verified", "accurate"],
        "false": ["false", "incorrect", "debunked", "fake", "misleading",
                  "pants on fire", "fabricated", "hoax"],
        "mixed": ["partly true", "half true", "mixture", "mostly",
                  "context", "out of context"],
    }

    def analyze(self, results, claim):
        """Tally verdict keywords across result titles and return a summary.

        Returns a dict with "verdict" (majority verdict or "unverified"),
        "confidence" (percentage of matching titles agreeing, 0-100),
        "sources_checked", and — when any keyword matched — a "breakdown"
        Counter snapshot. *claim* is accepted for interface stability but
        the scoring uses only the scraped titles.
        """
        verdicts = Counter()
        for result in results:
            title = result.get("title", "").lower()
            for verdict, keywords in self.VERDICT_KEYWORDS.items():
                # Word-boundary search instead of bare substring test:
                # "true" must not match "untrue", nor "fake" match "fakery".
                if any(re.search(r'\b' + re.escape(kw) + r'\b', title)
                       for kw in keywords):
                    verdicts[verdict] += 1

        total = sum(verdicts.values())
        if total == 0:
            # No keyword hits at all — report unverified, not a guess.
            return {
                "verdict": "unverified",
                "confidence": 0,
                "sources_checked": len(results)
            }

        top_verdict = verdicts.most_common(1)[0]
        confidence = top_verdict[1] / total
        return {
            "verdict": top_verdict[0],
            "confidence": round(confidence * 100, 1),
            "sources_checked": len(results),
            "breakdown": dict(verdicts)
        }
Enter fullscreen mode Exit fullscreen mode

Step 4: Putting It All Together

class FactChecker:
    """End-to-end pipeline: build queries, scrape sources, score evidence."""

    def __init__(self):
        self.query_builder = QueryBuilder()
        self.scraper = SourceScraper()
        self.analyzer = EvidenceAnalyzer()

    def check(self, claim):
        """Check *claim* against the fact-check sources.

        Returns the analyzer's verdict dict augmented with "claim" and
        up to five supporting "articles".
        """
        print(f"Checking: {claim}")
        queries = self.query_builder.build_queries(claim)

        all_results = []
        # Only the first two queries are searched to bound request volume.
        for query in queries[:2]:
            all_results.extend(self.scraper.search_all_sources(query))

        # De-duplicate by URL, but only for results that actually have
        # one. Previously the empty-string URL of link-less results was
        # added to `seen_urls`, silently discarding every later result
        # whose link could not be scraped.
        seen_urls = set()
        unique_results = []
        for r in all_results:
            url = r.get("url", "")
            if url:
                if url in seen_urls:
                    continue
                seen_urls.add(url)
            unique_results.append(r)

        verdict = self.analyzer.analyze(unique_results, claim)
        verdict["claim"] = claim
        verdict["articles"] = unique_results[:5]
        return verdict

if __name__ == "__main__":
    # Demo entry point. Guarded so importing this module does not
    # immediately fire network requests at the fact-check sites.
    checker = FactChecker()
    result = checker.check("The Great Wall of China is visible from space")
    print(f"Verdict: {result['verdict']} ({result['confidence']}% confidence)")
Enter fullscreen mode Exit fullscreen mode

Scaling the Fact-Checker

For production use, a proxy service such as ScraperAPI can handle rotation when you are checking many sources at scale; providers like ThorData offer residential IPs for news sites, and tools like ScrapeOps help you monitor scraper health.

Limitations

  • Not a replacement for human judgment
  • Only as good as the sources checked
  • Nuanced claims may be oversimplified
  • Always present confidence levels transparently

Conclusion

An automated fact-checker is a powerful screening tool. By scraping multiple authoritative sources, you can quickly assess claims at scale. Always show your sources and confidence levels — let users make the final judgment.

Top comments (0)