Overview
This tutorial demonstrates constructing a customer support voice agent using AssemblyAI's Universal-3 Pro Streaming for speech-to-text, OpenAI GPT-4o for LLM orchestration with function calling, and ElevenLabs for voice synthesis.
The Core Problem: Transcription Accuracy
The core challenge with function-calling voice agents is transcription accuracy. When customers provide specific entities like order IDs, phone numbers, or email addresses — the exact data your functions need — poor STT quality causes silent failures. Garbage in, garbage function call out.
Universal-3 Pro Streaming addresses this with superior entity recognition: a 34.79% missed-entity rate on phone numbers (down from 37.11% with previous models) and 59.64% on emails (down from 89.09%).
What You'll Build
Three customer support scenarios:
- Check order status — Retrieve status via order ID
- Schedule a callback — Capture name and phone number
- Transfer to human — Escalate when needed
Tech Stack:
- AssemblyAI Universal-3 Pro Streaming (STT)
- OpenAI GPT-4o (LLM with function calling)
- ElevenLabs (text-to-speech)
- Python 3.9+
Setup & Installation
pip install websockets openai elevenlabs pyaudio python-dotenv
ASSEMBLYAI_API_KEY=your_key_here
OPENAI_API_KEY=your_key_here
ELEVENLABS_API_KEY=your_key_here
Step 1: Connect to Universal-3 Pro Streaming
Universal-3 Pro Streaming operates via WebSocket, streaming audio chunks and receiving Turn messages with transcripts.
import asyncio
import json
import os
from urllib.parse import urlencode

import pyaudio
import websockets
from dotenv import load_dotenv
load_dotenv()
ASSEMBLYAI_WS_URL = "wss://streaming.assemblyai.com/v3/ws"
SAMPLE_RATE = 16000
CHUNK_SIZE = 8000 # 500ms at 16kHz
async def connect_assemblyai():
    """Open an authenticated WebSocket to Universal-3 Pro Streaming.

    Returns the connected websocket. The caller owns it: either close it
    explicitly or drain it until a Termination message arrives.
    """
    params = {
        "speech_model": "u3-rt-pro",
        "sample_rate": SAMPLE_RATE,
        "format_turns": "true",
    }
    # urlencode percent-escapes values; hand-joining f"{k}={v}" would
    # silently build a broken URL if any value contained '&', '=', or spaces.
    url = f"{ASSEMBLYAI_WS_URL}?{urlencode(params)}"
    # NOTE(review): websockets >= 13's new asyncio client renamed
    # extra_headers to additional_headers — confirm the pinned version.
    ws = await websockets.connect(
        url,
        extra_headers={"Authorization": os.getenv("ASSEMBLYAI_API_KEY")},
    )
    return ws
Handling Turn Messages
async def receive_transcripts(ws, on_turn_complete):
    """Consume streaming messages from the AssemblyAI websocket.

    Awaits ``on_turn_complete(transcript)`` for every finalized, non-empty
    turn, and returns once a Termination message is observed.
    """
    async for raw in ws:
        event = json.loads(raw)
        kind = event.get("type")
        if kind == "Begin":
            print(f"Session started: {event['id']}")
        elif kind == "Turn":
            text = event.get("transcript", "")
            # Only act on complete turns with actual content; partial
            # transcripts and whitespace-only turns are ignored.
            if event.get("end_of_turn") and text.strip():
                print(f"User said: {text}")
                await on_turn_complete(text)
        elif kind == "Termination":
            print("Session ended.")
            return
Step 2: Define Your Functions
GPT-4o uses JSON Schema to understand available tools and their parameters.
# OpenAI function-calling schema: one entry per support action. GPT-4o
# selects a tool and fills its arguments from the transcribed speech, so
# argument quality depends directly on STT accuracy for entities like
# order IDs and phone numbers.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_order_status",
            "description": "Look up the status of a customer order by order ID.",
            "parameters": {
                "type": "object",
                "properties": {
                    "order_id": {
                        "type": "string",
                        "description": "The customer's order ID, e.g. AB3792"
                    }
                },
                "required": ["order_id"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "schedule_callback",
            "description": "Schedule a callback for a customer who wants to be called back.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "The customer's full name"
                    },
                    "phone_number": {
                        "type": "string",
                        "description": "The customer's phone number in any format"
                    }
                },
                "required": ["name", "phone_number"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "transfer_to_human",
            "description": "Transfer the customer to a human agent when requested or when the issue can't be resolved.",
            "parameters": {
                "type": "object",
                "properties": {
                    "reason": {
                        "type": "string",
                        "description": "Brief reason for the transfer"
                    }
                },
                "required": ["reason"]
            }
        }
    }
]
Function Handlers
def get_order_status(order_id: str) -> str:
    """Look up a mock order by ID.

    The ID is normalized before lookup — separators stripped, uppercased —
    so STT renderings like "ab 3792" or "A-B-3792" still match "AB3792".

    Returns a human-readable status line, or a not-found message that
    echoes the ID exactly as supplied.
    """
    mock_orders = {
        "AB3792": "Shipped — expected delivery April 15",
        "CD1204": "Processing — ships within 2 business days",
    }
    # Drop anything that isn't a letter or digit (spaces, hyphens, dots)
    # that speech-to-text may insert between spelled-out characters.
    normalized = "".join(ch for ch in order_id if ch.isalnum()).upper()
    result = mock_orders.get(normalized)
    return result if result else f"No order found with ID {order_id}"
def schedule_callback(name: str, phone_number: str) -> str:
    """Log the callback request and return a spoken confirmation line."""
    print("[SYSTEM] Callback scheduled: {} at {}".format(name, phone_number))
    return "Got it. We'll call {} at {} within 2 hours.".format(name, phone_number)
def transfer_to_human(reason: str) -> str:
    """Log the escalation reason and return the hold message for the caller."""
    notice = "[SYSTEM] Transferring to human. Reason: " + reason
    print(notice)
    return "Transferring you now. Please hold for a moment."
# Dispatch table: tool names exactly as declared in TOOLS, mapped to their
# Python handlers. process_with_llm uses this to execute model tool calls.
FUNCTION_MAP = {
    "get_order_status": get_order_status,
    "schedule_callback": schedule_callback,
    "transfer_to_human": transfer_to_human,
}
Step 3: Wire Up the LLM with Function Calling
from openai import OpenAI
# Module-level OpenAI client, created once and reused across turns.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Shared, mutable chat history for the whole session. The system prompt
# steers the model toward short phone-style replies and proactive tool use;
# process_with_llm appends user, assistant, and tool messages to this list.
conversation_history = [
    {
        "role": "system",
        "content": (
            "You are a helpful customer support voice agent. "
            "Keep responses short — this is a phone call. "
            "When a customer mentions an order ID, phone number, or name, "
            "use the appropriate tool. Always confirm details before acting."
        )
    }
]
async def process_with_llm(transcript: str) -> str:
    """Send one user turn to GPT-4o, execute any tool calls, return the reply.

    Side effects: appends the user turn, the assistant tool-call message,
    tool results, and the final assistant reply to conversation_history.
    """
    conversation_history.append({"role": "user", "content": transcript})
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation_history,
        tools=TOOLS,
        tool_choice="auto"
    )
    message = response.choices[0].message
    # No tool call requested: plain conversational reply.
    if not message.tool_calls:
        reply = message.content
        conversation_history.append({"role": "assistant", "content": reply})
        return reply
    # The assistant message carrying tool_calls must precede the tool
    # results in history, or the follow-up request is rejected by the API.
    conversation_history.append(message)
    results = []
    for tool_call in message.tool_calls:
        fn_name = tool_call.function.name
        fn = FUNCTION_MAP.get(fn_name)
        if fn is None:
            # The model can hallucinate tool names; feed the error back to
            # the model instead of crashing the call loop with a KeyError.
            fn_result = f"Error: unknown function {fn_name}"
        else:
            fn_args = json.loads(tool_call.function.arguments)
            print(f"[TOOL] Calling {fn_name} with {fn_args}")
            fn_result = fn(**fn_args)
        results.append({
            "tool_call_id": tool_call.id,
            "role": "tool",
            "content": fn_result
        })
    conversation_history.extend(results)
    # Second round-trip lets the model phrase the tool output for speech.
    follow_up = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation_history
    )
    reply = follow_up.choices[0].message.content
    conversation_history.append({"role": "assistant", "content": reply})
    return reply
Step 4: Add Text-to-Speech
from elevenlabs.client import ElevenLabs
from elevenlabs import stream as el_stream
# Module-level ElevenLabs client, reused by speak() for every reply.
el_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
def speak(text: str):
    """Synthesize *text* with ElevenLabs and play it through the speakers."""
    print(f"Agent: {text}")
    # PCM at 16 kHz matches the rest of the pipeline's audio settings.
    pcm_audio = el_client.text_to_speech.convert(
        text=text,
        voice_id="EXAVITQu4vr4xnSDxMaL",
        model_id="eleven_turbo_v2",
        output_format="pcm_16000",
    )
    el_stream(pcm_audio)
Step 5: Putting It All Together
import threading
async def run_agent():
    """Run the capture -> STT -> LLM -> TTS loop until the session ends.

    Opens the microphone and the AssemblyAI websocket, streams audio up,
    and speaks each LLM reply. Guarantees the audio device is released
    even if the session errors out.
    """
    ws = await connect_assemblyai()

    async def on_turn_complete(transcript: str):
        reply = await process_with_llm(transcript)
        # TTS playback is blocking; run it off the event loop so the
        # websocket keeps receiving while the agent is speaking.
        await asyncio.to_thread(speak, reply)

    audio = pyaudio.PyAudio()
    stream = audio.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=CHUNK_SIZE
    )

    async def send_audio():
        try:
            while True:
                # stream.read blocks for up to 500 ms per chunk; move it to
                # a worker thread so transcript handling stays responsive.
                chunk = await asyncio.to_thread(
                    stream.read, CHUNK_SIZE, exception_on_overflow=False
                )
                await ws.send(chunk)
        except websockets.ConnectionClosed:
            pass

    try:
        await asyncio.gather(
            send_audio(),
            receive_transcripts(ws, on_turn_complete)
        )
    finally:
        # Release the microphone and the audio subsystem on any exit path.
        stream.stop_stream()
        stream.close()
        audio.terminate()
# Script entry point: runs the agent until the streaming session terminates.
if __name__ == "__main__":
    print("Voice agent ready. Speak to begin.")
    asyncio.run(run_agent())
Why Entity Accuracy Matters
When a customer says "My order number is A-B-3-7-9-2," lower-accuracy STT produces garbled results like "a b 3792" or "ABE 37 92." Function lookup fails. The agent cannot help.
Universal-3 Pro Streaming's superior entity recognition ensures phone numbers, order IDs, and email addresses transcribe correctly — making function calls reliable.
Next Steps
- Keyterms prompting — Pass known order ID formats to boost recognition
- Streaming TTS — Output speech while functions execute for lower latency
- Interruption handling — Allow customer to interrupt mid-response
- Production audio — Deploy via Twilio or LiveKit with native AssemblyAI integrations
Top comments (0)