DEV Community

agenthustler
agenthustler

Posted on

How to Build a B2C Lead Qualifier with Social Profile Scraping

Why Qualify Leads with Social Data

Not all leads are equal. A signup from a CEO with 50K followers is worth more than an anonymous Gmail address. Social profile data lets you score and prioritize leads automatically, so your sales team focuses on high-value prospects.

Let's build a lead qualification engine that enriches contact data with social signals.

What We Extract

  • Professional title and company
  • Social following and engagement rates
  • Content themes and interests
  • Activity level and recency
  • Network connections and influence score

Setup

pip install requests beautifulsoup4 pandas
Enter fullscreen mode Exit fullscreen mode

Social platforms have aggressive bot detection. ScraperAPI handles proxy rotation and browser fingerprinting.

The Profile Scraper

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime

# ScraperAPI key interpolated into every request URL built by
# scrape_social_profile. "YOUR_KEY" is a placeholder: replace it with a real
# key before running, and avoid committing the real value to source control.
SCRAPER_API_KEY = "YOUR_KEY"

def scrape_social_profile(platform, username):
    """Fetch a public profile page through ScraperAPI and parse it.

    Args:
        platform: One of "twitter", "github" or "dev".
        username: Account handle on that platform.

    Returns:
        The dict produced by the platform-specific parser, or None when the
        platform is not supported.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            proxy answers with an HTTP error status.
    """
    urls = {
        "twitter": f"https://nitter.net/{username}",
        "github": f"https://github.com/{username}",
        "dev": f"https://dev.to/{username}"
    }

    target = urls.get(platform)
    if not target:
        return None

    # Let requests build the query string so the target URL is percent-encoded;
    # pasting it in raw lets any "&", "?" or "#" in it corrupt the API call.
    # HTTPS keeps the API key out of clear text, and the timeout stops a
    # stalled proxy from hanging the whole batch.
    response = requests.get(
        "https://api.scraperapi.com/",
        params={"api_key": SCRAPER_API_KEY, "url": target},
        timeout=70,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page as a profile.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    if platform == "github":
        return parse_github_profile(soup, username)
    elif platform == "twitter":
        return parse_twitter_profile(soup, username)
    elif platform == "dev":
        return parse_dev_profile(soup, username)
    return None
Enter fullscreen mode Exit fullscreen mode

Platform-Specific Parsers

def parse_github_profile(soup, username):
    """Build a profile dict from a parsed GitHub user page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``.
        username: Handle the page was fetched for; copied into the result.

    Returns:
        Dict with keys "platform", "username", "name", "bio", "followers",
        "repos" and "company". Text fields are stripped strings or None when
        the element is not found; the two counts come from extract_number.
    """

    def text_of(selector):
        # Stripped text of the first matching element, or None if absent.
        node = soup.select_one(selector)
        return node.text.strip() if node else None

    def count_of(selector):
        # Numeric value of the first matching element via extract_number.
        return extract_number(soup.select_one(selector))

    return {
        "platform": "github",
        "username": username,
        "name": text_of(".vcard-fullname, [itemprop=name]"),
        "bio": text_of(".user-profile-bio, [data-bio-text]"),
        "followers": count_of("a[href$=followers] .text-bold"),
        "repos": count_of(".UnderlineNav-body a .Counter"),
        "company": text_of(".vcard-detail [itemprop=worksFor]")
    }

def parse_twitter_profile(soup, username):
    """Build a profile dict from a parsed Nitter profile page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``.
        username: Handle the page was fetched for; copied into the result.

    Returns:
        Dict with keys "platform", "username", "name", "bio" and "followers".
        "name" and "bio" are stripped strings or None when the element is not
        found; "followers" comes from extract_number.
    """
    name = soup.select_one(".profile-card-fullname")
    bio = soup.select_one(".profile-card-bio")
    # Nitter renders one .profile-stat-num per stat (posts, following,
    # followers, likes), so an unscoped selector returns the post count.
    # Scope the lookup to the "followers" list item.
    followers = soup.select_one(".followers .profile-stat-num")

    return {
        "platform": "twitter",
        "username": username,
        "name": name.text.strip() if name else None,
        "bio": bio.text.strip() if bio else None,
        "followers": extract_number(followers)
    }

def parse_dev_profile(soup, username):
    """Build a profile dict from a parsed dev.to user page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``.
        username: Handle the page was fetched for; copied into the result.

    Returns:
        Dict with keys "platform", "username", "name" and "bio"; the last two
        are stripped strings, or None when no element matches.
    """

    def text_of(selector):
        # Stripped text of the first matching element, or None if absent.
        node = soup.select_one(selector)
        if node:
            return node.text.strip()
        return None

    profile = {"platform": "dev", "username": username}
    profile["name"] = text_of(".profile-header__name, h1")
    profile["bio"] = text_of(".profile-header__bio, .js-user-info")
    return profile

def extract_number(el):
    """Parse the first number in an element's text as an integer count.

    Thousands separators are removed, and an abbreviated count such as
    "1.2k" or "3.4M" is expanded (1200, 3400000) rather than cut off at the
    decimal point.

    Args:
        el: Object with a ``text`` attribute, or a falsy value when the
            element was not found.

    Returns:
        The parsed count, or 0 when ``el`` is falsy or holds no digits.
    """
    if not el:
        return 0
    text = el.text.strip().replace(",", "")
    # A suffix counts only when it directly follows the digits and is not the
    # start of a word, so "12 members" and "10km" stay 12 and 10.
    match = re.search(r"(\d+)(?:\.(\d+))?(?:([kKmMbB])(?![A-Za-z]))?", text)
    if not match:
        return 0

    whole, fraction, suffix = match.groups()
    if not suffix:
        # No multiplier: keep the integer part only.
        return int(whole)

    multiplier = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}[suffix.lower()]
    # round() absorbs float noise such as 1.2 * 1000 == 1199.9999999999998.
    return int(round(float(f"{whole}.{fraction or 0}") * multiplier))
Enter fullscreen mode Exit fullscreen mode

Lead Scoring Engine

def score_lead(profiles):
    """Score a lead from the social profiles found for it.

    Points are awarded for combined follower count (up to 30), a listed
    company (25), a non-empty bio (15), number of profiles (10 each, capped
    at 20) and more than 10 repos in total (10).

    Args:
        profiles: List of profile dicts as returned by the platform parsers.

    Returns:
        Dict with "score" (int), "grade" (letter from grade_lead) and
        "reasons" (list of explanation strings, in scoring order).
    """
    points = 0
    notes = []

    # Follower score (0-30 points): only the highest matching tier counts.
    follower_total = sum(entry.get("followers", 0) for entry in profiles)
    influence_tiers = (
        (10000, 30, "High influence"),
        (1000, 20, "Medium influence"),
        (100, 10, "Some influence"),
    )
    for threshold, award, label in influence_tiers:
        if follower_total > threshold:
            points += award
            notes.append(f"{label}: {follower_total} followers")
            break

    # Company presence (0-25 points): report the first company found.
    employer = next(
        (entry.get("company") for entry in profiles if entry.get("company")),
        None,
    )
    if employer:
        points += 25
        notes.append(f"Company: {employer}")

    # Bio completeness (0-15 points): a bio on any profile is enough.
    if any(entry.get("bio") for entry in profiles):
        points += 15
        notes.append("Has professional bio")

    # Multi-platform presence (0-20 points)
    profile_count = len(profiles)
    points += min(profile_count * 10, 20)
    notes.append(f"Active on {profile_count} platforms")

    # Activity signals (0-10 points)
    repo_total = sum(entry.get("repos", 0) for entry in profiles)
    if repo_total > 10:
        points += 10
        notes.append(f"Active builder: {repo_total} repos")

    return {
        "score": points,
        "grade": grade_lead(points),
        "reasons": notes
    }

def grade_lead(score):
    """Convert a numeric lead score into a letter grade.

    Args:
        score: Numeric score, e.g. as computed by score_lead.

    Returns:
        "A" for 80 and above, "B" for 60-79, "C" for 40-59, "D" for 20-39,
        otherwise "F".
    """
    # Checked from the highest floor down; the first floor reached wins.
    grade_floors = ((80, "A"), (60, "B"), (40, "C"), (20, "D"))
    for floor, letter in grade_floors:
        if score >= floor:
            return letter
    return "F"
Enter fullscreen mode Exit fullscreen mode

Batch Lead Qualification

import time

def qualify_leads(leads, delay=1):
    """Scrape, score and rank a batch of leads.

    Args:
        leads: Iterable of dicts with an "email" key and an optional "social"
            mapping of platform name to username.
        delay: Seconds to pause after each scrape attempt. Defaults to 1.

    Returns:
        pandas.DataFrame with columns "email", "score", "grade", "reasons"
        and "profiles_found", sorted by score in descending order. An empty
        input gives an empty frame with those columns.

    Raises:
        KeyError: If a lead has no "email" key.
    """
    columns = ["email", "score", "grade", "reasons", "profiles_found"]
    results = []

    for lead in leads:
        profiles = []
        # "social" may be missing or explicitly None; treat both as empty.
        for platform, username in (lead.get("social") or {}).items():
            try:
                profile = scrape_social_profile(platform, username)
            except Exception as e:
                # Best effort: one failed profile must not abort the batch.
                print(f"Error scraping {platform}/{username}: {e}")
            else:
                if profile:
                    profiles.append(profile)
            finally:
                # Pause after failures too, so errors do not cause a burst
                # of back-to-back requests.
                time.sleep(delay)

        scoring = score_lead(profiles)
        results.append({
            "email": lead["email"],
            "score": scoring["score"],
            "grade": scoring["grade"],
            "reasons": "; ".join(scoring["reasons"]),
            "profiles_found": len(profiles)
        })

    # Naming the columns keeps "score" present when there are no results;
    # otherwise sort_values raises KeyError on an empty frame.
    df = pd.DataFrame(results, columns=columns).sort_values("score", ascending=False)
    return df

# Example usage
# Each lead has an "email" plus a "social" mapping of platform name to the
# username on that platform; only "twitter", "github" and "dev" are handled
# by scrape_social_profile.
leads = [
    {"email": "alice@company.com", "social": {"github": "alice-dev", "twitter": "alice"}},
    {"email": "bob@startup.io", "social": {"github": "bobcoder"}},
    {"email": "carol@bigcorp.com", "social": {"twitter": "carol_exec", "github": "carol"}}
]

# NOTE: this runs at module level, so it issues one HTTP request per profile
# (with a pause after each) as soon as the script is executed or imported.
df = qualify_leads(leads)
# Print the full ranked table, highest score first.
print(df.to_string())
Enter fullscreen mode Exit fullscreen mode

Output Example

            email  score grade                                                                        reasons  profiles_found
carol@bigcorp.com     60     B                    Company: BigCorp; Has professional bio; Active on 2 platforms               2
alice@company.com     55     C  Medium influence: 2350 followers; Has professional bio; Active on 2 platforms               2
   bob@startup.io     30     D     Some influence: 180 followers; Active on 1 platforms; Active builder: 14 repos               1
Enter fullscreen mode Exit fullscreen mode

Proxy Strategy

Social platforms are the hardest sites to scrape. Use ThorData residential proxies for platforms that block datacenter IPs. Monitor success rates with ScrapeOps.

Ethical Considerations

  • Only scrape publicly available profiles
  • Respect robots.txt and rate limits
  • Comply with GDPR and CCPA for lead data
  • Never scrape private or protected accounts
  • Store enrichment data securely with encryption

Conclusion

Automated lead qualification with social data lets you prioritize high-value prospects without manual research. With ScraperAPI handling the scraping infrastructure, you can focus on building the scoring logic that matters for your business.

Top comments (0)