Manual lead qualification is the biggest time sink in most small businesses. A salesperson spends 40-60% of their time on leads that will never convert. LLMs change this equation — when deployed correctly, they qualify leads with 85%+ accuracy at 200ms response times, freeing your team to focus on the 20% of leads worth their attention.
At AI Buddy, we've deployed this for dozens of Israeli businesses across e-commerce, home services, B2B SaaS, and healthcare. Here's the real implementation.
What "Lead Qualification" Actually Means
Before writing code, define what you're qualifying for. The BANT framework (Budget, Authority, Need, Timeline) is a reasonable starting point:
- Budget: Can they afford your product/service?
- Authority: Are they the decision maker?
- Need: Do they have the problem you solve?
- Timeline: Are they buying soon or just browsing?
You need to extract these from unstructured conversation text and produce a score. Here's the data model:
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum
class QualificationTier(Enum):
    """Routing tier derived from the 0-100 qualification score."""

    HOT = "hot"                    # 80-100: call within 1 hour
    WARM = "warm"                  # 50-79: follow up within 24 hours
    COLD = "cold"                  # 20-49: nurture sequence
    DISQUALIFIED = "disqualified"  # 0-19: not a fit
@dataclass
class LeadScore:
    """BANT component scores; each axis contributes 0-25 toward a 0-100 total."""

    budget_score: int = 0      # 0-25
    authority_score: int = 0   # 0-25
    need_score: int = 0        # 0-25
    timeline_score: int = 0    # 0-25
    raw_data: dict = field(default_factory=dict)  # extracted signals kept for auditing

    @property
    def total(self) -> int:
        """Sum of the four component scores (0-100)."""
        return sum((
            self.budget_score,
            self.authority_score,
            self.need_score,
            self.timeline_score,
        ))

    @property
    def tier(self) -> QualificationTier:
        """Map the total onto a routing tier via descending thresholds."""
        thresholds = (
            (80, QualificationTier.HOT),
            (50, QualificationTier.WARM),
            (20, QualificationTier.COLD),
        )
        for floor, tier in thresholds:
            if self.total >= floor:
                return tier
        return QualificationTier.DISQUALIFIED
Prompt Engineering for Structured Extraction
The hardest part of LLM-based qualification is getting reliable structured output. Here's what works in production — a two-stage approach:
Stage 1: Extract facts from conversation
# Stage-1 prompt: single LLM call that turns a raw transcript into the JSON
# signal schema below.
# NOTE(review): the template body contains literal JSON braces, so rendering
# it with str.format() raises KeyError on tokens like "budget". Substitute the
# {conversation} placeholder with str.replace instead.
EXTRACTION_PROMPT = """Analyze this customer conversation and extract qualification signals.
Conversation:
{conversation}
Extract the following information. If not mentioned, use null. Be conservative — only mark something as present if the customer actually said it.
Return ONLY valid JSON, no commentary:
{
"budget": {
"mentioned": true/false,
"amount_indicated": "under 1k / 1k-5k / 5k-20k / 20k+ / unknown",
"sensitivity": "price_sensitive / neutral / not_concerned",
"quotes": ["exact quote where budget was discussed"]
},
"authority": {
"is_decision_maker": true/false/null,
"mentioned_others": true/false,
"role_signals": ["any role/title mentioned"],
"quotes": ["exact quote"]
},
"need": {
"problem_stated": true/false,
"problem_description": "one sentence",
"urgency_signals": ["phrases indicating urgency"],
"fit_score_hint": "clear_fit / possible_fit / poor_fit / unknown"
},
"timeline": {
"mentioned": true/false,
"timeframe": "immediate / this_month / this_quarter / this_year / just_browsing / unknown",
"quotes": ["exact quote"]
},
"red_flags": ["any disqualifying signals like wrong geography, impossible requirements, etc."]
}"""
import anthropic
import json
import re
# Shared Anthropic client reused by every extraction call below.
client = anthropic.Anthropic()
def extract_qualification_signals(conversation: str) -> dict:
    """Extract structured qualification signals from raw conversation text.

    Stage 1 of the pipeline: one LLM call returning the JSON schema defined
    in EXTRACTION_PROMPT.

    Args:
        conversation: Full conversation transcript to analyze.

    Returns:
        Parsed dict of qualification signals.

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON after
            fence stripping.
    """
    # BUGFIX: EXTRACTION_PROMPT contains literal JSON braces, so
    # str.format() would raise KeyError on them. Substitute the single
    # placeholder directly instead.
    prompt = EXTRACTION_PROMPT.replace("{conversation}", conversation)
    response = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": prompt,
        }],
    )
    text = response.content[0].text.strip()
    # Strip a markdown code fence if the model wrapped the JSON in one.
    text = re.sub(r'^```(?:json)?\s*', '', text)
    text = re.sub(r'\s*```$', '', text)
    return json.loads(text)
Stage 2: Score from extracted signals
def score_lead(signals: dict) -> LeadScore:
    """Convert extracted signals into a LeadScore (0-25 per BANT axis).

    Deterministic stage 2 of the pipeline — no LLM involved, so a wrong
    score can always be traced back to the extracted signals.

    Args:
        signals: Dict in the shape produced by extract_qualification_signals.

    Returns:
        Populated LeadScore with the raw signals attached for auditing.
    """
    score = LeadScore()

    # Budget scoring (0-25): scale by indicated spend, penalize sensitivity.
    budget = signals.get("budget", {})
    if budget.get("mentioned"):
        amount_map = {
            "20k+": 25,
            "5k-20k": 20,
            "1k-5k": 12,
            "under 1k": 5,
            "unknown": 10,
        }
        score.budget_score = amount_map.get(budget.get("amount_indicated", "unknown"), 10)
        if budget.get("sensitivity") == "price_sensitive":
            score.budget_score = max(0, score.budget_score - 8)
    else:
        score.budget_score = 10  # Neutral if budget never came up.

    # Authority scoring (0-25).
    authority = signals.get("authority", {})
    if authority.get("is_decision_maker") is True:
        score.authority_score = 25
    elif authority.get("is_decision_maker") is None:
        score.authority_score = 12  # Unknown — don't penalize.
    elif authority.get("mentioned_others"):
        score.authority_score = 8  # Has to check with someone.
    else:
        score.authority_score = 0  # Explicitly not the decision maker.

    # Need scoring (0-25): base fit plus an urgency bonus, capped at 25.
    need = signals.get("need", {})
    fit_map = {
        "clear_fit": 25,
        "possible_fit": 15,
        "poor_fit": 0,
        "unknown": 8,
    }
    score.need_score = fit_map.get(need.get("fit_score_hint", "unknown"), 8)
    if need.get("urgency_signals"):
        score.need_score = min(25, score.need_score + 5)

    # Timeline scoring (0-25).
    timeline = signals.get("timeline", {})
    timeline_map = {
        "immediate": 25,
        "this_month": 20,
        "this_quarter": 12,
        "this_year": 6,
        "just_browsing": 0,
        "unknown": 8,
    }
    score.timeline_score = timeline_map.get(timeline.get("timeframe", "unknown"), 8)

    # Red flags are defined as *disqualifying* signals, so zero ALL four
    # components. BUGFIX: previously only budget and need were zeroed, so
    # authority (25) + timeline (25) could still total 50 — exactly the
    # WARM threshold — despite a disqualifying red flag.
    if signals.get("red_flags"):
        score.budget_score = 0
        score.authority_score = 0
        score.need_score = 0
        score.timeline_score = 0

    score.raw_data = signals
    return score
The Full Pipeline
Putting extraction and scoring together:
from typing import Tuple
import time
import logging
# Module-level logger named after this module, per logging convention.
logger = logging.getLogger(__name__)
def qualify_lead(conversation: str, lead_id: str) -> Tuple[LeadScore, dict]:
    """Full qualification pipeline: extract signals, then score them.

    Args:
        conversation: Raw conversation transcript.
        lead_id: Identifier used for logging and downstream correlation.

    Returns:
        (score, metadata) — metadata includes timing and model info. On a
        JSON parse failure, a neutral 40/100 fallback score is returned with
        metadata["fallback"] = True instead of raising.

    Raises:
        Exception: Any non-parse failure is logged and re-raised.
    """
    # perf_counter is monotonic, so durations can't be skewed by wall-clock
    # adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        # Step 1: extract signals via the LLM.
        signals = extract_qualification_signals(conversation)
        extraction_time = time.perf_counter() - start
        # Step 2: deterministic scoring.
        score = score_lead(signals)
        total_time = time.perf_counter() - start
        metadata = {
            "lead_id": lead_id,
            "extraction_ms": int(extraction_time * 1000),
            "total_ms": int(total_time * 1000),
            "model": "claude-opus-4-5",
            "tier": score.tier.value,
            "total_score": score.total,
        }
        # Lazy %-style args avoid string formatting when the level is off.
        logger.info(
            "Lead %s qualified: %s (%s/100) in %sms",
            lead_id, score.tier.value, score.total, metadata["total_ms"],
        )
        return score, metadata
    except json.JSONDecodeError:
        # Model emitted malformed JSON — degrade to a neutral score rather
        # than crashing and dropping the lead. logger.exception keeps the
        # traceback that the old f-string error log discarded.
        logger.exception("JSON parse failed for lead %s", lead_id)
        return LeadScore(budget_score=10, authority_score=10, need_score=10, timeline_score=10), {
            "lead_id": lead_id,
            "error": "extraction_failed",
            "fallback": True,
        }
    except Exception:
        logger.exception("Qualification pipeline failed for %s", lead_id)
        raise
Real Performance Numbers
From our production deployments:
| Metric | Value |
|---|---|
| Average extraction time (claude-haiku) | 180-220ms |
| Average extraction time (claude-opus) | 800-1200ms |
| Extraction accuracy vs human | 88% agreement |
| False positive rate (HOT tier) | 11% |
| False negative rate (missing HOT leads) | 4% |
| Cost per qualification (claude-haiku) | ~$0.003 |
| Cost per qualification (claude-opus) | ~$0.04 |
We use haiku for initial triage and opus for leads flagged as potentially HOT. The hybrid approach costs ~$0.006 average per lead.
CRM Integration
Scores mean nothing without action. Push to your CRM immediately:
import os
from datetime import datetime, timezone

import requests
def push_qualified_lead_to_crm(
    lead_data: dict,
    score: LeadScore,
    crm_base_url: str,
    api_key: str
):
    """Push a qualified lead to the CRM, routed by its tier.

    Args:
        lead_data: Contact info; 'name', 'phone', 'email' keys are read if present.
        score: Computed LeadScore; its tier drives pipeline stage and priority.
        crm_base_url: Base URL of the CRM API (no trailing slash).
        api_key: Bearer token for the CRM API.

    Returns:
        The CRM API's JSON response body.

    Raises:
        requests.HTTPError: If the CRM rejects the request.
    """
    tier = score.tier

    # Map tiers to CRM pipeline stages.
    pipeline_map = {
        QualificationTier.HOT: "Sales Qualified",
        QualificationTier.WARM: "Marketing Qualified",
        QualificationTier.COLD: "Nurture",
        QualificationTier.DISQUALIFIED: "Closed Lost",
    }
    # Map tiers to assignment priority.
    priority_map = {
        QualificationTier.HOT: "urgent",
        QualificationTier.WARM: "high",
        QualificationTier.COLD: "normal",
        QualificationTier.DISQUALIFIED: "low",
    }

    payload = {
        "contact": {
            "name": lead_data.get("name"),
            "phone": lead_data.get("phone"),
            "email": lead_data.get("email"),
        },
        "qualification": {
            "score": score.total,
            "tier": tier.value,
            "breakdown": {
                "budget": score.budget_score,
                "authority": score.authority_score,
                "need": score.need_score,
                "timeline": score.timeline_score,
            },
            "pipeline_stage": pipeline_map[tier],
            "priority": priority_map[tier],
        },
        "metadata": {
            "source": "ai_qualification",
            # FIX: datetime.utcnow() is deprecated and returns a naive
            # timestamp; emit an explicit timezone-aware UTC timestamp.
            "qualified_at": datetime.now(timezone.utc).isoformat(),
            "raw_signals": score.raw_data,
        },
    }

    response = requests.post(
        f"{crm_base_url}/api/leads",
        json=payload,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=10,
    )
    response.raise_for_status()

    # Trigger immediate action for HOT leads.
    if tier == QualificationTier.HOT:
        trigger_hot_lead_alert(lead_data, score)

    return response.json()
def trigger_hot_lead_alert(lead_data: dict, score: LeadScore):
    """Send an immediate Slack alert to the sales team for a HOT lead.

    Best-effort: skips silently when the webhook is not configured, and a
    webhook failure is logged rather than raised — an alerting hiccup must
    not fail the CRM push that already succeeded.

    Args:
        lead_data: Contact info; 'name' and 'phone' keys are read if present.
        score: The HOT LeadScore to report.
    """
    # Requires `import os` at module level (was missing in the original).
    slack_url = os.environ.get("SLACK_SALES_WEBHOOK")
    if not slack_url:
        return  # Alerting not configured.
    try:
        requests.post(slack_url, json={
            "text": f"🔥 HOT LEAD — {score.total}/100\n"
                    f"Name: {lead_data.get('name', 'Unknown')}\n"
                    f"Phone: {lead_data.get('phone', 'Unknown')}\n"
                    f"Budget: {score.budget_score}/25 | "
                    f"Authority: {score.authority_score}/25 | "
                    f"Need: {score.need_score}/25 | "
                    f"Timeline: {score.timeline_score}/25\n"
                    f"Problem: {score.raw_data.get('need', {}).get('problem_description', 'N/A')}"
        }, timeout=5)
    except requests.RequestException:
        # Log and move on — the lead is already in the CRM.
        logger.warning("Slack HOT-lead alert failed for %s", lead_data.get("name"), exc_info=True)
Validation and Continuous Improvement
Your qualification model will drift. Build validation in from the start:
def log_qualification_outcome(lead_id: str, actual_outcome: str):
    """Record the real sales outcome for a lead so predictions can be audited.

    Upserts into the analytics DB: a repeated call for the same lead_id
    overwrites the previously logged outcome.

    Args:
        lead_id: Lead identifier matching the qualification record.
        actual_outcome: One of 'won' | 'lost' | 'nurturing' | 'disqualified'.
    """
    db = get_db()
    upsert_sql = """
        INSERT INTO qualification_outcomes (lead_id, actual_outcome, logged_at)
        VALUES (?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT (lead_id) DO UPDATE SET actual_outcome = excluded.actual_outcome
    """
    db.execute(upsert_sql, (lead_id, actual_outcome))
    db.commit()
def calculate_model_accuracy(days_back: int = 30) -> dict:
    """Cross-tabulate predicted tiers against actual sales outcomes.

    Looks at qualifications from the last `days_back` days; run weekly to
    catch model drift.

    Args:
        days_back: Lookback window in days.

    Returns:
        {"breakdown": [...]} with one row per (predicted_tier,
        actual_outcome) pair and its count.
    """
    db = get_db()
    # SQLite datetime modifier, e.g. "-30 days".
    window = f"-{days_back} days"
    query = """
        SELECT
            lq.predicted_tier,
            lo.actual_outcome,
            COUNT(*) as count
        FROM lead_qualifications lq
        JOIN qualification_outcomes lo ON lq.lead_id = lo.lead_id
        WHERE lq.created_at > datetime('now', ?)
        GROUP BY lq.predicted_tier, lo.actual_outcome
    """
    rows = db.execute(query, (window,)).fetchall()
    return {"breakdown": [dict(row) for row in rows]}
Lessons Learned
Don't score in one LLM call. Separating extraction from scoring makes the system debuggable. When a score is wrong, you can inspect the extracted signals and see exactly what went wrong.
Neutral scores for missing data. If someone hasn't mentioned budget, don't score them 0 on budget. Score them neutral (10/25). You don't know yet — don't disqualify on ignorance.
Recalibrate monthly. Compare HOT leads from 60 days ago against actual close rates. If your HOT tier closes at 20% (it should be 60%+), your scoring weights are off.
Human review for borderline cases. Leads scoring 45-55 should get a human look. The model is most uncertain here.
AI Buddy offers fully managed AI qualification pipelines if you'd rather configure than build.
Top comments (0)