Misinformation is everywhere. In this guide, we'll build a Python-based fact-checker that scrapes multiple sources to verify claims automatically.
How Automated Fact-Checking Works
Our fact-checker will:
- Parse a claim into searchable components
- Search multiple authoritative sources
- Compare findings against the claim
- Return a confidence score
Setting Up
pip install requests beautifulsoup4 newspaper3k
Step 1: Query Builder
import re
from typing import List
class QueryBuilder:
STOP_WORDS = {
"the", "a", "an", "is", "are", "was", "were", "be", "been",
"have", "has", "had", "do", "does", "did", "will", "would",
"could", "should", "that", "this", "what", "which", "who"
}
def build_queries(self, claim: str) -> List[str]:
words = re.findall(r'\b[a-zA-Z]+\b', claim.lower())
keywords = [w for w in words if w not in self.STOP_WORDS and len(w) > 2]
return [
" ".join(keywords),
" ".join(keywords[:5]),
f'"{claim[:80]}"',
]
Step 2: Multi-Source Scraper
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
class SourceScraper:
    """Search a fixed list of fact-checking sites and scrape their result listings."""

    # Each entry: display name plus a search-URL template with one {} slot
    # for the (URL-encoded) query.
    FACT_CHECK_SOURCES = [
        {"name": "Snopes", "search": "https://www.snopes.com/?s={}"},
        {"name": "PolitiFact", "search": "https://www.politifact.com/search/?q={}"},
        {"name": "FactCheck.org", "search": "https://www.factcheck.org/?s={}"},
    ]

    def __init__(self):
        # One session so connections and headers are reused across requests.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "FactChecker/1.0 (Research Tool)"
        })

    def search_source(self, source, query):
        """Return up to 5 {source, title, url} dicts from one site; [] on any error.

        Errors are printed and swallowed deliberately: one unreachable
        site must not abort the whole check.
        """
        # Queries contain spaces and quote characters (the exact-phrase
        # query is wrapped in "..."), so they must be percent-encoded
        # before being placed in the URL.
        url = source["search"].format(quote_plus(query))
        try:
            response = self.session.get(url, timeout=10)
            # Don't scrape "results" out of a 4xx/5xx error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            results = []
            for article in soup.find_all("article")[:5]:
                title_el = article.find(["h2", "h3", "h4"])
                link_el = article.find("a", href=True)
                if title_el:
                    results.append({
                        "source": source["name"],
                        "title": title_el.get_text(strip=True),
                        "url": link_el["href"] if link_el else "",
                    })
            return results
        except Exception as e:
            print(f"Error searching {source['name']}: {e}")
            return []

    def search_all_sources(self, query):
        """Run *query* against every configured source, politely rate-limited."""
        all_results = []
        for source in self.FACT_CHECK_SOURCES:
            results = self.search_source(source, query)
            all_results.extend(results)
            time.sleep(2)  # be a considerate scraper between sites
        return all_results
Step 3: Evidence Analyzer
from collections import Counter
class EvidenceAnalyzer:
    """Score scraped fact-check headlines into a verdict and confidence."""

    # Keyword phrases that signal each verdict when found in a headline.
    VERDICT_KEYWORDS = {
        "true": ["true", "correct", "confirmed", "verified", "accurate"],
        "false": ["false", "untrue", "incorrect", "debunked", "fake",
                  "misleading", "pants on fire", "fabricated", "hoax"],
        "mixed": ["partly true", "half true", "mixture", "mostly",
                  "context", "out of context"],
    }

    @staticmethod
    def _has_keyword(title, keyword):
        """True if *keyword* occurs in *title* as whole words.

        Word-boundary matching prevents e.g. 'untrue' from registering
        as a hit for the 'true' keyword (which would invert the verdict).
        """
        return re.search(r'\b' + re.escape(keyword) + r'\b', title) is not None

    def analyze(self, results, claim):
        """Tally verdict signals in *results* and return a summary dict.

        Returns {"verdict", "confidence" (percent), "sources_checked"} and,
        when any signal was found, a per-verdict "breakdown". A title may
        contribute to several verdict buckets (e.g. "half true" hits both
        "true" and "mixed"). *claim* is currently unused but kept for
        future claim-vs-headline comparison.
        """
        verdicts = Counter()
        for result in results:
            title = result.get("title", "").lower()
            for verdict, keywords in self.VERDICT_KEYWORDS.items():
                if any(self._has_keyword(title, kw) for kw in keywords):
                    verdicts[verdict] += 1
        total = sum(verdicts.values())
        if total == 0:
            # No headline contained any verdict signal.
            return {
                "verdict": "unverified",
                "confidence": 0,
                "sources_checked": len(results)
            }
        top_verdict = verdicts.most_common(1)[0]
        confidence = top_verdict[1] / total
        return {
            "verdict": top_verdict[0],
            "confidence": round(confidence * 100, 1),
            "sources_checked": len(results),
            "breakdown": dict(verdicts)
        }
Step 4: Putting It All Together
class FactChecker:
    """End-to-end pipeline: build queries -> scrape sources -> analyze evidence."""

    def __init__(self):
        self.query_builder = QueryBuilder()
        self.scraper = SourceScraper()
        self.analyzer = EvidenceAnalyzer()

    def check(self, claim):
        """Fact-check *claim*; return the analyzer's verdict dict.

        The result also carries "claim" and up to 5 supporting "articles".
        Performs live HTTP requests via the scraper.
        """
        print(f"Checking: {claim}")
        queries = self.query_builder.build_queries(claim)
        all_results = []
        # Only the two highest-priority queries, to keep request volume down.
        for query in queries[:2]:
            all_results.extend(self.scraper.search_all_sources(query))
        # De-duplicate, keyed on URL when present. Results whose scrape
        # found no link have url == "", so keying on url alone would let
        # the first link-less result suppress every later one; fall back
        # to (source, title) for those.
        seen_keys = set()
        unique_results = []
        for r in all_results:
            key = r["url"] or (r["source"], r["title"])
            if key not in seen_keys:
                seen_keys.add(key)
                unique_results.append(r)
        verdict = self.analyzer.analyze(unique_results, claim)
        verdict["claim"] = claim
        verdict["articles"] = unique_results[:5]
        return verdict
if __name__ == "__main__":
    # Demo run — performs live HTTP requests against the fact-check sites,
    # so guard it to keep `import`ing this module side-effect free.
    checker = FactChecker()
    result = checker.check("The Great Wall of China is visible from space")
    print(f"Verdict: {result['verdict']} ({result['confidence']}% confidence)")
Scaling the Fact-Checker
For production use, consider a managed proxy service: ScraperAPI can handle proxy rotation for you when checking claims against many sources, and ThorData provides residential IPs for news sites that block datacenter traffic. A monitoring tool such as ScrapeOps helps you track scraper health and failure rates.
Limitations
- Not a replacement for human judgment
- Only as good as the sources checked
- Nuanced claims may be oversimplified
- Always present confidence levels transparently
Conclusion
An automated fact-checker is a powerful screening tool. By scraping multiple authoritative sources, you can quickly assess claims at scale. Always show your sources and confidence levels — let users make the final judgment.
Top comments (0)