How to Build an Influencer Vetting Tool with Social Scraping
Brands waste billions on influencer fraud: fake followers, bot engagement, and inflated metrics. An automated vetting tool that scrapes public social data can flag these patterns before money changes hands. Here's how to build one in Python.
The Influencer Fraud Problem
Studies estimate that 15-25% of influencer followers are fake. Engagement pods artificially inflate likes and comments. Without data-driven vetting, brands rely on self-reported metrics that are easily manipulated.
Architecture
Our vetting tool analyzes three dimensions and rolls them up into a single risk report (sketched below):
- Follower quality: bot detection via profile analysis
- Engagement authenticity: detecting pods and bought engagement
- Content consistency: verifying the claimed niche and posting patterns
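Each dimension yields a sub-report that feeds one overall verdict. Here is a minimal sketch of the report shape we build toward in this article; the field names are this article's own conventions, not any platform's API:

from typing import TypedDict

class VettingReport(TypedDict):
    profile: dict              # scraped public profile metadata
    engagement_analysis: dict  # uniformity, ratio, and spike checks
    follower_analysis: dict    # bot-score estimates over a follower sample
    overall_risk: str          # "LOW", "MEDIUM", or "HIGH"
    recommendation: str        # partner / proceed with caution / avoid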
Scraping Public Profile Data
We focus on publicly available data only. ScraperAPI handles the messy parts of scraping social platforms, including proxy rotation and JavaScript rendering:
import requests
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime

SCRAPER_API_KEY = "YOUR_KEY"

def scrape_public_profile(platform_url):
    # Route the request through ScraperAPI with JS rendering enabled,
    # since most social profile pages are client-side rendered
    response = requests.get(
        "http://api.scraperapi.com",
        params={
            "api_key": SCRAPER_API_KEY,
            "url": platform_url,
            "render": "true"
        },
        timeout=60
    )
    soup = BeautifulSoup(response.text, "html.parser")
    profile = {"url": platform_url, "scraped_at": datetime.utcnow().isoformat()}

    # Open Graph / meta tags carry the display name and bio on most platforms
    for meta in soup.find_all("meta"):
        prop = meta.get("property", meta.get("name", ""))
        content = meta.get("content", "")
        if "title" in prop:
            profile["name"] = content
        elif "description" in prop:
            profile["bio"] = content

    # Some platforms also embed structured JSON-LD we can parse directly
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
            if isinstance(data, dict):
                profile["structured_data"] = data
        except (json.JSONDecodeError, TypeError):
            pass

    return profile
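A quick smoke test of the scraper; the profile URL below is a placeholder, and the call needs a valid ScraperAPI key:

profile = scrape_public_profile("https://www.instagram.com/example_creator/")
print(profile.get("name"), "|", (profile.get("bio") or "")[:80])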
Engagement Analysis
Authentic engagement follows predictable statistical patterns. Fake engagement doesn't:
import numpy as np

class EngagementAnalyzer:
    def analyze_engagement_distribution(self, posts):
        if not posts:
            return {"verdict": "INSUFFICIENT_DATA"}

        likes = [p.get("likes", 0) for p in posts if p.get("likes") is not None]
        comments = [p.get("comments", 0) for p in posts if p.get("comments") is not None]
        if len(likes) < 10 or len(comments) < 10:
            return {"verdict": "INSUFFICIENT_DATA"}

        # Coefficient of variation: bought engagement is often delivered in
        # near-identical batches, producing unnaturally uniform counts
        likes_cv = np.std(likes) / max(np.mean(likes), 1)
        comments_cv = np.std(comments) / max(np.mean(comments), 1)
        suspicious_uniformity = likes_cv < 0.2 or comments_cv < 0.15

        # Like-to-comment ratio: organic audiences typically land between
        # these bounds; comment pods push it low, like-only bots push it high
        avg_likes = np.mean(likes)
        avg_comments = np.mean(comments)
        ratio = avg_likes / max(avg_comments, 1)
        suspicious_ratio = ratio < 5 or ratio > 200

        # Spike detection: a handful of posts vastly outperforming the rest
        # suggests engagement was bought for specific (often sponsored) posts
        sorted_likes = sorted(likes, reverse=True)
        top_10_pct = sorted_likes[:max(len(sorted_likes) // 10, 1)]
        bottom_90_pct = sorted_likes[max(len(sorted_likes) // 10, 1):]
        spike_ratio = np.mean(top_10_pct) / max(np.mean(bottom_90_pct), 1)
        suspicious_spikes = spike_ratio > 10

        return {
            "likes_cv": round(likes_cv, 3),
            "comments_cv": round(comments_cv, 3),
            "like_comment_ratio": round(ratio, 1),
            "spike_ratio": round(spike_ratio, 1),
            "flags": {
                "uniform_engagement": suspicious_uniformity,
                "abnormal_ratio": suspicious_ratio,
                "engagement_spikes": suspicious_spikes
            },
            "verdict": (
                "SUSPICIOUS" if sum([suspicious_uniformity, suspicious_ratio, suspicious_spikes]) >= 2
                else "REVIEW" if any([suspicious_uniformity, suspicious_ratio, suspicious_spikes])
                else "LIKELY_AUTHENTIC"
            )
        }
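To sanity-check the analyzer, feed it a synthetic feed with near-identical counts; made-up numbers like these should trip the uniformity and ratio flags and return SUSPICIOUS:

suspicious_posts = [{"likes": 5000 + i, "comments": 20} for i in range(20)]
result = EngagementAnalyzer().analyze_engagement_distribution(suspicious_posts)
print(result["flags"])    # uniform_engagement and abnormal_ratio are True
print(result["verdict"])  # SUSPICIOUS (two or more flags raised)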
Bot Follower Detection
class FollowerAnalyzer:
    # Each indicator adds a weighted amount to a follower's bot score
    BOT_INDICATORS = {
        "no_profile_pic": 0.3, "no_posts": 0.25,
        "username_pattern": 0.2, "following_much_more": 0.15, "no_bio": 0.1
    }

    def analyze_follower_sample(self, followers):
        if not followers:
            return {"sample_size": 0, "verdict": "INSUFFICIENT_DATA"}

        bot_scores = []
        for follower in followers:
            score = 0
            if not follower.get("has_profile_pic", True):
                score += self.BOT_INDICATORS["no_profile_pic"]
            if follower.get("post_count", 1) == 0:
                score += self.BOT_INDICATORS["no_posts"]
            # Auto-generated usernames often look like "word12345"
            if re.match(r"^[a-z]{3,8}\d{4,8}$", follower.get("username", "")):
                score += self.BOT_INDICATORS["username_pattern"]
            # Mass-follow bots follow far more accounts than follow them back
            if follower.get("following", 0) > 5 * max(follower.get("followers", 1), 1):
                score += self.BOT_INDICATORS["following_much_more"]
            if not follower.get("bio"):
                score += self.BOT_INDICATORS["no_bio"]
            bot_scores.append(score)

        # A score above 0.5 means multiple strong indicators co-occur
        high_bot = sum(1 for s in bot_scores if s > 0.5)
        fake_pct = high_bot / max(len(bot_scores), 1) * 100

        return {
            "sample_size": len(followers),
            "estimated_fake_pct": round(fake_pct, 1),
            "avg_bot_score": round(np.mean(bot_scores), 3),
            "verdict": (
                "HIGH_FRAUD_RISK" if fake_pct > 30
                else "MODERATE_RISK" if fake_pct > 15
                else "LOW_RISK"
            )
        }
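A quick check with two hand-made follower records; the field names match what the analyzer reads, and real records would come from a follower-sample scrape:

sample = [
    {"username": "jane_doe", "has_profile_pic": True, "post_count": 120,
     "following": 350, "followers": 400, "bio": "Coffee and code"},
    {"username": "xkqf48291", "has_profile_pic": False, "post_count": 0,
     "following": 4000, "followers": 12, "bio": ""},
]
print(FollowerAnalyzer().analyze_follower_sample(sample))
# The second record hits every indicator and scores 1.0, so the
# estimated fake percentage for this tiny sample is 50%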
Comprehensive Vetting Report
def generate_vetting_report(profile_url, posts, followers):
    profile = scrape_public_profile(profile_url)
    engagement = EngagementAnalyzer().analyze_engagement_distribution(posts)
    follower_quality = FollowerAnalyzer().analyze_follower_sample(followers)

    # Too little data is itself a reason for caution, so INSUFFICIENT_DATA
    # maps to MEDIUM rather than falling through to LOW
    overall_risk = "HIGH" if (
        engagement["verdict"] == "SUSPICIOUS" or
        follower_quality["verdict"] == "HIGH_FRAUD_RISK"
    ) else "MEDIUM" if (
        engagement["verdict"] in ("REVIEW", "INSUFFICIENT_DATA") or
        follower_quality["verdict"] in ("MODERATE_RISK", "INSUFFICIENT_DATA")
    ) else "LOW"

    return {
        "profile": profile,
        "engagement_analysis": engagement,
        "follower_analysis": follower_quality,
        "overall_risk": overall_risk,
        "recommendation": (
            "DO NOT PARTNER" if overall_risk == "HIGH"
            else "PROCEED WITH CAUTION" if overall_risk == "MEDIUM"
            else "LIKELY SAFE"
        )
    }
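Wiring it all together. The inputs below are synthetic stand-ins (real posts and followers come from your own scrapes, the URL is a placeholder, and the profile call needs a ScraperAPI key), so the verdict will vary with the random draw:

import random

posts = [{"likes": random.randint(500, 2500), "comments": random.randint(20, 80)}
         for _ in range(15)]
followers = [{"username": f"user_{i}", "has_profile_pic": True, "post_count": 25,
              "following": 300, "followers": 280, "bio": "hello"} for i in range(50)]

report = generate_vetting_report(
    "https://www.instagram.com/example_creator/", posts, followers)
print(report["overall_risk"], "->", report["recommendation"])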
Proxy Infrastructure for Social Scraping
Social platforms aggressively block scrapers. ThorData residential proxies mimic real user connections. ScrapeOps helps optimize request patterns and monitor success rates.
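If you route requests through a residential proxy yourself rather than (or alongside) ScraperAPI, the standard requests proxies mapping is all you need. A minimal sketch; the gateway host, port, and credentials below are placeholders, so substitute the values from your provider's dashboard:

import requests

# Placeholder credentials and endpoint -- use your provider's actual gateway
PROXY_USER = "your_username"
PROXY_PASS = "your_password"
PROXY_GATEWAY = "gateway.example-proxy-provider.com:7777"

proxies = {
    "http": f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_GATEWAY}",
    "https": f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_GATEWAY}",
}

# Requests to public pages now exit through the residential gateway
resp = requests.get("https://example.com/public-profile", proxies=proxies, timeout=30)
print(resp.status_code)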
Ethical Considerations
This tool analyzes publicly available data only. Never scrape private accounts, and always comply with platform terms of service. The goal is protecting brands from fraud, not invading privacy.
Influencer marketing fraud is a multi-billion-dollar problem. Data-driven vetting is the solution.