agenthustler

Posted on Mar 27

How to Build a B2C Lead Qualifier with Social Profile Scraping

#python #tutorial #webdev #programming

Why Qualify Leads with Social Data

Not all leads are equal. A signup from a CEO with 50K followers is worth more than an anonymous Gmail address. Social profile data lets you score and prioritize leads automatically, so your sales team focuses on high-value prospects.

Let's build a lead qualification engine that enriches contact data with social signals.

What We Extract

Professional title and company
Social following and engagement rates
Content themes and interests
Activity level and recency
Network connections and influence score

Setup

pip install requests beautifulsoup4 pandas

Social platforms have aggressive bot detection. ScraperAPI handles proxy rotation and browser fingerprinting.

The Profile Scraper

import os
import re
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read the ScraperAPI key from the environment so the secret is not stored in
# the source file. The "YOUR_KEY" placeholder is kept as the fallback so the
# snippet behaves as before when the variable is unset or empty.
SCRAPER_API_KEY = os.environ.get("SCRAPER_API_KEY") or "YOUR_KEY"

def scrape_social_profile(platform, username):
    """Fetch a public profile page through ScraperAPI and parse it.

    Args:
        platform: One of "twitter", "github" or "dev".
        username: Account handle on that platform.

    Returns:
        The dict produced by the matching platform parser, or None when
        the platform is not supported (no request is made in that case).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status from the API.
    """
    from urllib.parse import quote

    url_templates = {
        "twitter": "https://nitter.net/{}",
        "github": "https://github.com/{}",
        "dev": "https://dev.to/{}",
    }

    template = url_templates.get(platform)
    if template is None:
        return None

    # Percent-encode the handle so characters such as "/", "?" or "#" cannot
    # change which page is requested.
    target = template.format(quote(str(username), safe=""))

    # Let requests build the query string: the target URL is itself a URL and
    # must be encoded to survive as a single `url` parameter. HTTPS keeps the
    # API key out of clear text on the wire.
    response = requests.get(
        "https://api.scraperapi.com/",
        params={"api_key": SCRAPER_API_KEY, "url": target},
        # requests has no default timeout; without one a stalled connection
        # would block the whole batch.
        timeout=70,
    )
    # An error page would otherwise parse into a profile full of None values
    # that still counts as a "found" profile downstream.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    parsers = {
        "github": parse_github_profile,
        "twitter": parse_twitter_profile,
        "dev": parse_dev_profile,
    }
    return parsers[platform](soup, username)

Platform-Specific Parsers

def parse_github_profile(soup, username):
    """Build a profile dict from a parsed GitHub profile page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``.
        username: Handle the page was fetched for; copied into the result.

    Returns:
        Dict with keys "platform", "username", "name", "bio", "followers",
        "repos" and "company". Text fields are stripped strings or None
        when the element is not found; the two counts come from
        ``extract_number``.
    """

    def stripped_text(selector):
        # First element matching the selector, as stripped text, else None.
        node = soup.select_one(selector)
        if node:
            return node.text.strip()
        return None

    def count(selector):
        # Numeric value of the first element matching the selector.
        return extract_number(soup.select_one(selector))

    profile = {"platform": "github", "username": username}
    profile["name"] = stripped_text(".vcard-fullname, [itemprop=name]")
    profile["bio"] = stripped_text(".user-profile-bio, [data-bio-text]")
    profile["followers"] = count("a[href$=followers] .text-bold")
    profile["repos"] = count(".UnderlineNav-body a .Counter")
    profile["company"] = stripped_text(".vcard-detail [itemprop=worksFor]")
    return profile

def parse_twitter_profile(soup, username):
    """Build a profile dict from a parsed Nitter profile page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``.
        username: Handle the page was fetched for; copied into the result.

    Returns:
        Dict with keys "platform", "username", "name", "bio" and
        "followers". "name" and "bio" are stripped strings or None when the
        element is not found; "followers" comes from ``extract_number``.
    """
    name = soup.select_one(".profile-card-fullname")
    bio = soup.select_one(".profile-card-bio")
    # Every stat on the profile card (tweets, following, followers, likes)
    # uses the .profile-stat-num class, so a bare selector returns the first
    # one rather than the follower count. Scope it to the followers item.
    # NOTE(review): assumes Nitter marks that list item with class
    # "followers" inside .profile-statlist - confirm against live markup.
    followers = soup.select_one(
        ".profile-statlist .followers .profile-stat-num"
    )

    return {
        "platform": "twitter",
        "username": username,
        "name": name.text.strip() if name else None,
        "bio": bio.text.strip() if bio else None,
        "followers": extract_number(followers),
    }

def parse_dev_profile(soup, username):
    """Build a profile dict from a parsed dev.to profile page.

    Args:
        soup: Parsed page exposing ``select_one(css_selector)``.
        username: Handle the page was fetched for; copied into the result.

    Returns:
        Dict with keys "platform", "username", "name" and "bio". The text
        fields are stripped strings, or None when no element matches.
    """
    selectors = (
        ("name", ".profile-header__name, h1"),
        ("bio", ".profile-header__bio, .js-user-info"),
    )

    profile = {"platform": "dev", "username": username}
    for field, selector in selectors:
        node = soup.select_one(selector)
        if node:
            profile[field] = node.text.strip()
        else:
            profile[field] = None
    return profile

def extract_number(el):
    """Parse the first count found in an element's text.

    Thousands separators are removed, and abbreviated counts such as
    "1.2k", "12K" or "3.4M" are expanded (k = thousand, m = million,
    b = billion). Without a suffix only the integer part is used, so
    "1,234" gives 1234 and "1.5" gives 1.

    Args:
        el: Object with a ``text`` attribute, or a falsy value when the
            element was not found.

    Returns:
        The parsed count as an int, or 0 when ``el`` is falsy or its text
        contains no digits.
    """
    if not el:
        return 0
    text = el.text.strip().replace(",", "")
    # The suffix only counts when it is not the start of a word, so
    # "12 members" or "5mb" are not read as millions.
    match = re.search(
        r"(\d+)(?:\.(\d+))?(?:([kmb])(?![a-z]))?", text, re.IGNORECASE
    )
    if not match:
        return 0

    whole, fraction, suffix = match.groups()
    if suffix is None:
        return int(whole)

    multiplier = {"k": 10**3, "m": 10**6, "b": 10**9}[suffix.lower()]
    # round() guards against float artefacts such as 2299.9999999.
    return round(float(f"{whole}.{fraction or 0}") * multiplier)

Lead Scoring Engine

def score_lead(profiles):
    """Score a lead from the social profiles found for it.

    Points are awarded per signal: combined followers (up to 30), a listed
    company (25), a non-empty bio (15), number of profiles (10 each, capped
    at 20) and more than 10 repositories in total (10).

    Args:
        profiles: List of profile dicts; the keys "followers", "company",
            "bio" and "repos" are read when present.

    Returns:
        Dict with "score" (int), "grade" (letter from ``grade_lead``) and
        "reasons" (list of human-readable strings, one per awarded signal).
    """
    points = 0
    notes = []

    def award(amount, note):
        # Add points and record why they were given.
        nonlocal points
        points += amount
        notes.append(note)

    # Follower score (0-30 points): only the highest tier reached counts.
    follower_total = sum(p.get("followers", 0) for p in profiles)
    follower_tiers = (
        (10000, 30, "High"),
        (1000, 20, "Medium"),
        (100, 10, "Some"),
    )
    for floor, amount, label in follower_tiers:
        if follower_total > floor:
            award(amount, f"{label} influence: {follower_total} followers")
            break

    # Company presence (0-25 points): the first non-empty company is reported.
    employer = next(
        (p.get("company") for p in profiles if p.get("company")), None
    )
    if employer:
        award(25, f"Company: {employer}")

    # Bio completeness (0-15 points)
    if any(p.get("bio") for p in profiles):
        award(15, "Has professional bio")

    # Multi-platform presence (0-20 points): always reported, even for zero.
    profile_count = len(profiles)
    award(min(20, profile_count * 10), f"Active on {profile_count} platforms")

    # Activity signals (0-10 points)
    repo_total = sum(p.get("repos", 0) for p in profiles)
    if repo_total > 10:
        award(10, f"Active builder: {repo_total} repos")

    return {"score": points, "grade": grade_lead(points), "reasons": notes}

def grade_lead(score):
    """Translate a numeric lead score into a letter grade.

    Args:
        score: Numeric score.

    Returns:
        "A" for 80 and above, "B" for 60-79, "C" for 40-59, "D" for 20-39,
        otherwise "F".
    """
    # Ordered from the highest cutoff down; the first one reached wins.
    cutoffs = ((80, "A"), (60, "B"), (40, "C"), (20, "D"))
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    return "F"

Batch Lead Qualification

import time

def qualify_leads(leads):
    """Scrape, score and rank a batch of leads.

    Args:
        leads: Iterable of dicts. Each needs an "email" key and may carry a
            "social" mapping of platform name to username.

    Returns:
        pandas.DataFrame with the columns "email", "score", "grade",
        "reasons" and "profiles_found", sorted by score in descending
        order. An empty input yields an empty frame with those columns.

    Raises:
        KeyError: If a lead has no "email" key.
    """
    columns = ["email", "score", "grade", "reasons", "profiles_found"]
    results = []

    for lead in leads:
        profiles = []
        # `or {}` also covers a lead that carries an explicit "social": None.
        for platform, username in (lead.get("social") or {}).items():
            try:
                profile = scrape_social_profile(platform, username)
            except Exception as e:
                # Best effort: one failing profile must not abort the batch.
                print(f"Error scraping {platform}/{username}: {e}")
            else:
                if profile:
                    profiles.append(profile)
            finally:
                # Pause after every attempt, including failed ones, so errors
                # do not speed up the request rate.
                time.sleep(1)

        scoring = score_lead(profiles)
        results.append({
            "email": lead["email"],
            "score": scoring["score"],
            "grade": scoring["grade"],
            "reasons": "; ".join(scoring["reasons"]),
            "profiles_found": len(profiles),
        })

    # Naming the columns keeps sort_values from raising KeyError on an empty
    # result list, where the frame would otherwise have no "score" column.
    df = pd.DataFrame(results, columns=columns)
    return df.sort_values("score", ascending=False)

# Example usage
# Sample leads: each has a contact email and a mapping of platform -> username.
leads = [
    {"email": "alice@company.com", "social": {"github": "alice-dev", "twitter": "alice"}},
    {"email": "bob@startup.io", "social": {"github": "bobcoder"}},
    {"email": "carol@bigcorp.com", "social": {"twitter": "carol_exec", "github": "carol"}}
]

# Guarded so that importing this module does not start a live scraping run;
# executing the file as a script behaves as before.
if __name__ == "__main__":
    df = qualify_leads(leads)
    print(df.to_string())

Output Example

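               email  score grade                                                                                                  reasons  profiles_found
0  alice@company.com     65     B  Medium influence: 2400 followers; Has professional bio; Active on 2 platforms; Active builder: 34 repos               2
2  carol@bigcorp.com     60     B                                            Company: BigCorp; Has professional bio; Active on 2 platforms               2
1     bob@startup.io     45     C     Some influence: 180 followers; Has professional bio; Active on 1 platforms; Active builder: 25 repos               1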

Proxy Strategy

Social platforms are the hardest sites to scrape. Use ThorData residential proxies for platforms that block datacenter IPs. Monitor success rates with ScrapeOps.

Ethical Considerations

Only scrape publicly available profiles
Respect robots.txt and rate limits
Comply with GDPR and CCPA for lead data
Never scrape private or protected accounts
Store enrichment data securely with encryption

Conclusion

Automated lead qualification with social data lets you prioritize high-value prospects without manual research. With ScraperAPI handling the scraping infrastructure, you can focus on building the scoring logic that matters for your business.

DEV Community