## Introduction

This guide shows how to implement text-to-speech (TTS) in a .NET 8 application using ElevenLabs and Deepgram Aura, with support for streaming audio and voice customization.
## Interface Definition

```csharp
public interface ITTSProvider
{
    Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null);
    IAsyncEnumerable<byte[]> StreamSynthesizeAsync(string text, VoiceOptions? options = null);
    Task<List<Voice>> GetAvailableVoicesAsync();
}

public record VoiceOptions(
    string? VoiceId = null,
    float? Speed = null,
    float? Pitch = null,
    AudioFormat Format = AudioFormat.PCM16);

public record Voice(string Id, string Name, string Description, string? PreviewUrl);

public enum AudioFormat
{
    PCM16,
    MP3,
    OGG
}
```
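As a quick orientation before the provider implementations, here is a minimal consumption sketch (assuming an `ITTSProvider` instance called `tts`, e.g. resolved from DI):

```csharp
// Sketch: write streamed chunks to a file as they arrive.
// `tts` is any ITTSProvider implementation from this guide.
await using var output = File.Create("speech.pcm");
await foreach (var chunk in tts.StreamSynthesizeAsync("Hello there"))
{
    await output.WriteAsync(chunk);
}
```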
## ElevenLabs Implementation

### Setup

```bash
dotnet add package ElevenLabs
```
### Configuration

```json
// appsettings.json
{
  "ElevenLabs": {
    "ApiKey": "your-api-key",
    "DefaultVoice": "Rachel",
    "Model": "eleven_turbo_v2"
  }
}
```
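If you prefer strongly typed settings over raw `IConfiguration` lookups, the same section can be bound with the options pattern (a sketch; `ElevenLabsOptions` is a name introduced here, not part of the SDK):

```csharp
public sealed class ElevenLabsOptions
{
    public string ApiKey { get; set; } = string.Empty;
    public string DefaultVoice { get; set; } = "Rachel";
    public string Model { get; set; } = "eleven_turbo_v2";
}

// Program.cs: bind the "ElevenLabs" section at startup.
builder.Services.Configure<ElevenLabsOptions>(
    builder.Configuration.GetSection("ElevenLabs"));
```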
### Implementation

```csharp
using ElevenLabs;
using ElevenLabs.Models;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;

public class ElevenLabsTTSProvider : ITTSProvider
{
    private readonly ElevenLabsClient _client;
    private readonly string _defaultVoice;
    private readonly string _model;
    private readonly ILogger<ElevenLabsTTSProvider> _logger;

    public ElevenLabsTTSProvider(
        IConfiguration configuration,
        ILogger<ElevenLabsTTSProvider> logger)
    {
        var apiKey = configuration["ElevenLabs:ApiKey"]
            ?? throw new InvalidOperationException("ElevenLabs:ApiKey is not configured");
        _defaultVoice = configuration["ElevenLabs:DefaultVoice"] ?? "Rachel";
        _model = configuration["ElevenLabs:Model"] ?? "eleven_turbo_v2";
        _client = new ElevenLabsClient(apiKey);
        _logger = logger;
    }

    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        var voiceId = options?.VoiceId ?? _defaultVoice;
        var request = new TextToSpeechRequest
        {
            Text = text,
            ModelId = _model,
            VoiceSettings = new VoiceSettings
            {
                Stability = 0.5,
                SimilarityBoost = 0.75,
                Style = 0.0,
                UseSpeakerBoost = true
            }
        };

        var outputFormat = MapAudioFormat(options?.Format ?? AudioFormat.PCM16);

        using var audioStream = await _client.TextToSpeech.ConvertAsync(
            voiceId,
            request,
            outputFormat);

        using var memoryStream = new MemoryStream();
        await audioStream.CopyToAsync(memoryStream);

        _logger.LogInformation(
            "Generated {Bytes} bytes of audio for text length {Length}",
            memoryStream.Length,
            text.Length);

        return memoryStream.ToArray();
    }

    public async IAsyncEnumerable<byte[]> StreamSynthesizeAsync(
        string text,
        VoiceOptions? options = null)
    {
        var voiceId = options?.VoiceId ?? _defaultVoice;
        var request = new TextToSpeechRequest
        {
            Text = text,
            ModelId = _model,
            VoiceSettings = new VoiceSettings
            {
                Stability = 0.5,
                SimilarityBoost = 0.75
            }
        };

        var outputFormat = MapAudioFormat(options?.Format ?? AudioFormat.PCM16);

        await using var audioStream = await _client.TextToSpeech.ConvertStreamAsync(
            voiceId,
            request,
            outputFormat);

        // Read the response in fixed-size chunks and hand each one to the caller
        // as soon as it arrives, so playback can start before synthesis finishes.
        var buffer = new byte[4096];
        int bytesRead;
        while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
        {
            var chunk = new byte[bytesRead];
            Array.Copy(buffer, chunk, bytesRead);
            yield return chunk;
        }
    }

    public async Task<List<Voice>> GetAvailableVoicesAsync()
    {
        var voices = await _client.Voices.GetAllAsync();
        return voices.Select(v => new Voice(
            Id: v.VoiceId,
            Name: v.Name,
            Description: v.Description ?? string.Empty,
            PreviewUrl: v.PreviewUrl
        )).ToList();
    }

    private string MapAudioFormat(AudioFormat format)
    {
        return format switch
        {
            AudioFormat.PCM16 => "pcm_16000",
            AudioFormat.MP3 => "mp3_44100_128",
            AudioFormat.OGG => "ogg_opus",
            _ => "pcm_16000"
        };
    }
}
```
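To sanity-check the output locally, the raw `pcm_16000` audio can be played back with NAudio (a sketch; `provider` is assumed to be the class above, and `WaveOutEvent` is Windows-only):

```csharp
using NAudio.Wave;

var audio = await provider.SynthesizeAsync("Testing one two three");

// pcm_16000 is raw 16-bit mono PCM at 16 kHz; wrap it in a wave format for playback.
using var ms = new MemoryStream(audio);
using var source = new RawSourceWaveStream(ms, new WaveFormat(16000, 16, 1));
using var output = new WaveOutEvent();
output.Init(source);
output.Play();
while (output.PlaybackState == PlaybackState.Playing)
{
    await Task.Delay(100);
}
```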
## Deepgram Aura Implementation

### Setup

```bash
dotnet add package Deepgram
```
### Configuration

```json
// appsettings.json
{
  "Deepgram": {
    "ApiKey": "your-api-key",
    "DefaultVoice": "aura-asteria-en"
  }
}
```
### Implementation

```csharp
using Deepgram;
using Deepgram.Models;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;

public class DeepgramTTSProvider : ITTSProvider
{
    private readonly DeepgramClient _client;
    private readonly string _defaultVoice;
    private readonly ILogger<DeepgramTTSProvider> _logger;

    public DeepgramTTSProvider(
        IConfiguration configuration,
        ILogger<DeepgramTTSProvider> logger)
    {
        var apiKey = configuration["Deepgram:ApiKey"]
            ?? throw new InvalidOperationException("Deepgram:ApiKey is not configured");
        _defaultVoice = configuration["Deepgram:DefaultVoice"] ?? "aura-asteria-en";
        _client = new DeepgramClient(apiKey);
        _logger = logger;
    }

    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        var speakOptions = new SpeakOptions
        {
            Model = options?.VoiceId ?? _defaultVoice,
            Encoding = MapAudioFormat(options?.Format ?? AudioFormat.PCM16),
            SampleRate = 16000
        };

        var response = await _client.Speak.v("1").Synthesize(
            new TextSource(text),
            speakOptions);

        _logger.LogInformation(
            "Generated audio for text length {Length}",
            text.Length);

        return response.Data;
    }

    public async IAsyncEnumerable<byte[]> StreamSynthesizeAsync(
        string text,
        VoiceOptions? options = null)
    {
        var speakOptions = new SpeakOptions
        {
            Model = options?.VoiceId ?? _defaultVoice,
            Encoding = MapAudioFormat(options?.Format ?? AudioFormat.PCM16),
            SampleRate = 16000
        };

        await using var stream = await _client.Speak.v("1").SynthesizeStream(
            new TextSource(text),
            speakOptions);

        // Hand chunks to the caller as they arrive from the API.
        var buffer = new byte[4096];
        int bytesRead;
        while ((bytesRead = await stream.ReadAsync(buffer, 0, buffer.Length)) > 0)
        {
            var chunk = new byte[bytesRead];
            Array.Copy(buffer, chunk, bytesRead);
            yield return chunk;
        }
    }

    public Task<List<Voice>> GetAvailableVoicesAsync()
    {
        // Deepgram Aura exposes a fixed voice catalog rather than a listing
        // endpoint, so return the known voices (as of 2024).
        return Task.FromResult(new List<Voice>
        {
            new Voice("aura-asteria-en", "Asteria", "Female, warm and friendly", null),
            new Voice("aura-luna-en", "Luna", "Female, professional", null),
            new Voice("aura-stella-en", "Stella", "Female, energetic", null),
            new Voice("aura-athena-en", "Athena", "Female, clear", null),
            new Voice("aura-hera-en", "Hera", "Female, authoritative", null),
            new Voice("aura-orion-en", "Orion", "Male, deep", null),
            new Voice("aura-arcas-en", "Arcas", "Male, neutral", null),
            new Voice("aura-perseus-en", "Perseus", "Male, confident", null),
            new Voice("aura-angus-en", "Angus", "Male, friendly", null),
            new Voice("aura-orpheus-en", "Orpheus", "Male, smooth", null),
            new Voice("aura-helios-en", "Helios", "Male, warm", null),
            new Voice("aura-zeus-en", "Zeus", "Male, powerful", null)
        });
    }

    private string MapAudioFormat(AudioFormat format)
    {
        return format switch
        {
            AudioFormat.PCM16 => "linear16",
            AudioFormat.MP3 => "mp3",
            AudioFormat.OGG => "opus",
            _ => "linear16"
        };
    }
}
```
## Dependency Injection Setup

```csharp
// Program.cs
builder.Services.AddSingleton<ITTSProvider, ElevenLabsTTSProvider>();
// OR
builder.Services.AddSingleton<ITTSProvider, DeepgramTTSProvider>();
```
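The fallback and caching decorators shown later compose naturally here. One way to wire the full chain by hand (a sketch matching the constructors used in this article; it assumes an `IDistributedCache` registration such as `AddDistributedMemoryCache()`):

```csharp
builder.Services.AddSingleton<ElevenLabsTTSProvider>();
builder.Services.AddSingleton<DeepgramTTSProvider>();

// Cache on the outside, fallback on the inside:
// cache hit -> no API call; cache miss -> primary, then fallback.
builder.Services.AddSingleton<ITTSProvider>(sp =>
{
    var fallback = new FallbackTTSProvider(
        sp.GetRequiredService<ElevenLabsTTSProvider>(),
        sp.GetRequiredService<DeepgramTTSProvider>(),
        sp.GetRequiredService<ILogger<FallbackTTSProvider>>());

    return new CachedTTSProvider(
        fallback,
        sp.GetRequiredService<IDistributedCache>(),
        sp.GetRequiredService<ILogger<CachedTTSProvider>>());
});
```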
## Usage Examples

### Basic Text-to-Speech

```csharp
[ApiController]
[Route("api/[controller]")]
public class SpeechController : ControllerBase
{
    private readonly ITTSProvider _tts;

    public SpeechController(ITTSProvider tts)
    {
        _tts = tts;
    }

    [HttpPost("synthesize")]
    public async Task<IActionResult> Synthesize([FromBody] SynthesizeRequest request)
    {
        var options = new VoiceOptions(
            VoiceId: request.VoiceId,
            Format: AudioFormat.MP3);

        var audioData = await _tts.SynthesizeAsync(request.Text, options);
        return File(audioData, "audio/mpeg", "speech.mp3");
    }

    [HttpGet("voices")]
    public async Task<IActionResult> GetVoices()
    {
        var voices = await _tts.GetAvailableVoicesAsync();
        return Ok(voices);
    }
}

public record SynthesizeRequest(string Text, string? VoiceId);
```
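Calling the endpoint from another .NET service might look like this (a sketch; the base address and output path are illustrative):

```csharp
using System.Net.Http.Json;

using var http = new HttpClient { BaseAddress = new Uri("https://localhost:5001") };

// POST the request body and save the returned MP3 to disk.
var response = await http.PostAsJsonAsync(
    "api/speech/synthesize",
    new SynthesizeRequest("Hello from the client", VoiceId: null));
response.EnsureSuccessStatusCode();

var mp3 = await response.Content.ReadAsByteArrayAsync();
await File.WriteAllBytesAsync("speech.mp3", mp3);
```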
### Streaming Audio via WebSocket

```csharp
// Inside SpeechController; requires app.UseWebSockets() in the middleware pipeline.
[HttpGet("stream")]
public async Task StreamAudio([FromQuery] string text, [FromQuery] string? voiceId)
{
    if (!HttpContext.WebSockets.IsWebSocketRequest)
    {
        HttpContext.Response.StatusCode = 400;
        return;
    }

    using var webSocket = await HttpContext.WebSockets.AcceptWebSocketAsync();
    var options = new VoiceOptions(VoiceId: voiceId);

    // Forward each audio chunk to the client as soon as it is synthesized.
    await foreach (var audioChunk in _tts.StreamSynthesizeAsync(text, options))
    {
        await webSocket.SendAsync(
            new ArraySegment<byte>(audioChunk),
            WebSocketMessageType.Binary,
            endOfMessage: true,
            CancellationToken.None);
    }

    await webSocket.CloseAsync(
        WebSocketCloseStatus.NormalClosure,
        "Completed",
        CancellationToken.None);
}
```
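On the consuming side, a minimal .NET client for this endpoint could look like the following (a sketch; the URL and output path are illustrative):

```csharp
using System.Net.WebSockets;

using var ws = new ClientWebSocket();
await ws.ConnectAsync(
    new Uri("wss://localhost:5001/api/speech/stream?text=Hello"),
    CancellationToken.None);

// Collect binary frames until the server closes the socket.
var buffer = new byte[8192];
await using var output = File.Create("stream.pcm");
while (ws.State == WebSocketState.Open)
{
    var result = await ws.ReceiveAsync(new ArraySegment<byte>(buffer), CancellationToken.None);
    if (result.MessageType == WebSocketMessageType.Close) break;
    await output.WriteAsync(buffer.AsMemory(0, result.Count));
}
```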
### Real-time Conversation with Streaming

```csharp
public class ConversationStreamHandler
{
    private readonly INLPProvider _nlp;
    private readonly ITTSProvider _tts;

    public ConversationStreamHandler(INLPProvider nlp, ITTSProvider tts)
    {
        _nlp = nlp;
        _tts = tts;
    }

    public async IAsyncEnumerable<byte[]> ProcessAndSynthesize(string userMessage)
    {
        // Stream NLP response tokens
        await foreach (var token in _nlp.StreamResponseAsync(userMessage))
        {
            // Convert each token to speech immediately
            await foreach (var audioChunk in _tts.StreamSynthesizeAsync(token))
            {
                yield return audioChunk;
            }
        }
    }
}
```
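Synthesizing each token individually means one API call per token and choppy prosody. A common refinement is to buffer tokens into sentence-sized pieces before synthesis (a sketch of that idea, added to the same class; requires `using System.Text;`):

```csharp
public async IAsyncEnumerable<byte[]> ProcessAndSynthesizeBySentence(string userMessage)
{
    var sentence = new StringBuilder();
    await foreach (var token in _nlp.StreamResponseAsync(userMessage))
    {
        sentence.Append(token);

        // Flush on sentence-ending punctuation so each synthesis call
        // receives a natural unit of speech.
        if (token.EndsWith(".") || token.EndsWith("!") || token.EndsWith("?"))
        {
            await foreach (var chunk in _tts.StreamSynthesizeAsync(sentence.ToString()))
            {
                yield return chunk;
            }
            sentence.Clear();
        }
    }

    // Synthesize any trailing text that didn't end with punctuation.
    if (sentence.Length > 0)
    {
        await foreach (var chunk in _tts.StreamSynthesizeAsync(sentence.ToString()))
        {
            yield return chunk;
        }
    }
}
```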
## Voice Customization

### Custom Voice Settings (ElevenLabs)

```csharp
public class CustomVoiceService
{
    private readonly ElevenLabsClient _client;

    public CustomVoiceService(ElevenLabsClient client)
    {
        _client = client;
    }

    public async Task<byte[]> SynthesizeWithEmotionAsync(
        string text,
        string emotion)
    {
        // Map an emotion label to voice settings; lower stability and
        // higher style generally produce more expressive delivery.
        var voiceSettings = emotion.ToLower() switch
        {
            "excited" => new VoiceSettings
            {
                Stability = 0.3,
                SimilarityBoost = 0.9,
                Style = 0.8,
                UseSpeakerBoost = true
            },
            "calm" => new VoiceSettings
            {
                Stability = 0.8,
                SimilarityBoost = 0.6,
                Style = 0.2,
                UseSpeakerBoost = false
            },
            "professional" => new VoiceSettings
            {
                Stability = 0.7,
                SimilarityBoost = 0.7,
                Style = 0.0,
                UseSpeakerBoost = true
            },
            _ => new VoiceSettings()
        };

        var request = new TextToSpeechRequest
        {
            Text = text,
            ModelId = "eleven_turbo_v2",
            VoiceSettings = voiceSettings
        };

        using var audioStream = await _client.TextToSpeech.ConvertAsync(
            "Rachel",
            request,
            "pcm_16000");

        using var memoryStream = new MemoryStream();
        await audioStream.CopyToAsync(memoryStream);
        return memoryStream.ToArray();
    }
}
```
## Audio Post-Processing

### Normalize Volume

```csharp
public class AudioProcessor
{
    // Assumes 16-bit little-endian PCM samples.
    public byte[] NormalizeVolume(byte[] audioData, float targetLevel = 0.8f)
    {
        var samples = new short[audioData.Length / 2];
        Buffer.BlockCopy(audioData, 0, samples, 0, audioData.Length);

        // Find peak
        float peak = samples.Max(s => Math.Abs((int)s));
        if (peak == 0)
        {
            return audioData; // Silence; nothing to normalize.
        }

        float scale = (short.MaxValue * targetLevel) / peak;

        // Normalize
        for (int i = 0; i < samples.Length; i++)
        {
            samples[i] = (short)Math.Clamp(samples[i] * scale, short.MinValue, short.MaxValue);
        }

        var result = new byte[audioData.Length];
        Buffer.BlockCopy(samples, 0, result, 0, audioData.Length);
        return result;
    }
}
```
### Format Conversion

```csharp
// Uses the NAudio and NAudio.Lame packages.
public class AudioConverter
{
    public byte[] ConvertPCMToMP3(byte[] pcmData, int sampleRate = 16000)
    {
        using var pcmStream = new MemoryStream(pcmData);
        using var reader = new RawSourceWaveStream(
            pcmStream,
            new WaveFormat(sampleRate, 16, 1));

        using var mp3Stream = new MemoryStream();
        // Dispose the writer before reading the stream so the encoder
        // flushes its final frames.
        using (var writer = new LameMP3FileWriter(mp3Stream, reader.WaveFormat, 128))
        {
            reader.CopyTo(writer);
        }
        return mp3Stream.ToArray();
    }
}
```
## Error Handling and Fallback

```csharp
public class FallbackTTSProvider : ITTSProvider
{
    private readonly ITTSProvider _primary;
    private readonly ITTSProvider _fallback;
    private readonly ILogger<FallbackTTSProvider> _logger;

    public FallbackTTSProvider(
        ITTSProvider primary,
        ITTSProvider fallback,
        ILogger<FallbackTTSProvider> logger)
    {
        _primary = primary;
        _fallback = fallback;
        _logger = logger;
    }

    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        try
        {
            return await _primary.SynthesizeAsync(text, options);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Primary TTS failed, using fallback");
            return await _fallback.SynthesizeAsync(text, options);
        }
    }

    public async IAsyncEnumerable<byte[]> StreamSynthesizeAsync(
        string text,
        VoiceOptions? options = null)
    {
        // The enumerable is lazy: failures surface during enumeration, not when
        // StreamSynthesizeAsync is called, so enumerate manually to catch
        // mid-stream errors and still switch to the fallback.
        var enumerator = _primary.StreamSynthesizeAsync(text, options).GetAsyncEnumerator();
        var useFallback = false;
        try
        {
            while (true)
            {
                bool hasNext;
                try
                {
                    hasNext = await enumerator.MoveNextAsync();
                }
                catch (Exception ex)
                {
                    _logger.LogWarning(ex, "Primary TTS streaming failed, using fallback");
                    useFallback = true;
                    break;
                }
                if (!hasNext) break;
                yield return enumerator.Current;
            }
        }
        finally
        {
            await enumerator.DisposeAsync();
        }

        if (useFallback)
        {
            // Note: the fallback restarts from the beginning of the text,
            // so the listener may hear a partial primary stream first.
            await foreach (var chunk in _fallback.StreamSynthesizeAsync(text, options))
            {
                yield return chunk;
            }
        }
    }

    public Task<List<Voice>> GetAvailableVoicesAsync() => _primary.GetAvailableVoicesAsync();
}
```
## Caching for Performance

```csharp
using System.Security.Cryptography;
using System.Text;
using Microsoft.Extensions.Caching.Distributed;

public class CachedTTSProvider : ITTSProvider
{
    private readonly ITTSProvider _provider;
    private readonly IDistributedCache _cache;
    private readonly ILogger<CachedTTSProvider> _logger;

    public CachedTTSProvider(
        ITTSProvider provider,
        IDistributedCache cache,
        ILogger<CachedTTSProvider> logger)
    {
        _provider = provider;
        _cache = cache;
        _logger = logger;
    }

    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        var cacheKey = GenerateCacheKey(text, options);
        var cached = await _cache.GetAsync(cacheKey);
        if (cached != null)
        {
            _logger.LogInformation("Cache hit for text: {Text}", text[..Math.Min(20, text.Length)]);
            return cached;
        }

        var audio = await _provider.SynthesizeAsync(text, options);
        await _cache.SetAsync(
            cacheKey,
            audio,
            new DistributedCacheEntryOptions
            {
                AbsoluteExpirationRelativeToNow = TimeSpan.FromHours(24)
            });
        return audio;
    }

    // Streaming responses are passed through uncached.
    public IAsyncEnumerable<byte[]> StreamSynthesizeAsync(string text, VoiceOptions? options = null)
        => _provider.StreamSynthesizeAsync(text, options);

    public Task<List<Voice>> GetAvailableVoicesAsync() => _provider.GetAvailableVoicesAsync();

    private string GenerateCacheKey(string text, VoiceOptions? options)
    {
        // Include every option that changes the audio, or distinct requests collide.
        var optionsHash = options != null
            ? $"{options.VoiceId}_{options.Speed}_{options.Pitch}_{options.Format}"
            : "default";
        return $"tts:{optionsHash}:{ComputeHash(text)}";
    }

    private string ComputeHash(string text)
    {
        using var sha256 = SHA256.Create();
        var bytes = Encoding.UTF8.GetBytes(text);
        var hash = sha256.ComputeHash(bytes);
        return Convert.ToBase64String(hash);
    }
}
```
## Testing

```csharp
using Moq;
using Xunit;

public class TTSProviderTests
{
    // Note: this exercises the live ElevenLabs API, so it is really an
    // integration test; it will fail with a placeholder key.
    [Fact]
    public async Task Synthesize_ValidText_ReturnsAudio()
    {
        // Arrange
        var mockConfig = new Mock<IConfiguration>();
        mockConfig.Setup(x => x["ElevenLabs:ApiKey"]).Returns("test-key");
        var provider = new ElevenLabsTTSProvider(
            mockConfig.Object,
            Mock.Of<ILogger<ElevenLabsTTSProvider>>());

        // Act
        var audio = await provider.SynthesizeAsync("Hello world");

        // Assert
        Assert.NotEmpty(audio);
        Assert.True(audio.Length > 1000); // Reasonable audio size
    }

    [Fact]
    public async Task StreamSynthesize_ValidText_ReturnsChunks()
    {
        // Arrange
        var provider = CreateTestProvider();
        var chunks = new List<byte[]>();

        // Act
        await foreach (var chunk in provider.StreamSynthesizeAsync("Hello"))
        {
            chunks.Add(chunk);
        }

        // Assert
        Assert.NotEmpty(chunks);
        Assert.True(chunks.Sum(c => c.Length) > 1000);
    }
}
```
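For pure unit tests of code that consumes `ITTSProvider`, mocking the interface avoids network calls entirely (a sketch using Moq; the test name and fake bytes are illustrative):

```csharp
[Fact]
public async Task Consumer_UsesTTSProvider_ReturnsAudio()
{
    // Arrange: stub the provider to return canned audio bytes.
    var mockTts = new Mock<ITTSProvider>();
    mockTts
        .Setup(t => t.SynthesizeAsync(It.IsAny<string>(), It.IsAny<VoiceOptions?>()))
        .ReturnsAsync(new byte[] { 0x01, 0x02, 0x03 });

    // Act
    var audio = await mockTts.Object.SynthesizeAsync("Hello");

    // Assert
    Assert.Equal(3, audio.Length);
    mockTts.Verify(t => t.SynthesizeAsync("Hello", null), Times.Once);
}
```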
## Performance Benchmarks
Based on production testing:
| Provider | Latency (first chunk) | Quality | Cost per 1K chars |
|---|---|---|---|
| ElevenLabs | 300-500ms | Excellent | $0.18-0.30 |
| Deepgram Aura | 200-350ms | Very Good | $0.015 |
| Azure Neural | 400-600ms | Very Good | $0.015 |
| Google WaveNet | 500-700ms | Excellent | $0.016 |
## Best Practices

- Stream when possible: Starting playback on the first chunk can roughly halve perceived latency
- Cache common phrases: "Hello", "How can I help", etc.
- Pre-generate audio: For known responses in menu systems (see the sketch after this list)
- Normalize volume: Ensure consistent audio levels
- Handle errors gracefully: Always have a fallback provider
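For the pre-generation point, a hosted service can warm the cache at startup (a sketch; `CommonPhrases` is illustrative, and this assumes `CachedTTSProvider` is in the provider chain):

```csharp
public class TTSWarmupService : BackgroundService
{
    // Illustrative phrases; replace with your system's known responses.
    private static readonly string[] CommonPhrases =
    {
        "Hello", "How can I help you today?", "Please hold for a moment."
    };

    private readonly ITTSProvider _tts;

    public TTSWarmupService(ITTSProvider tts) => _tts = tts;

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        foreach (var phrase in CommonPhrases)
        {
            if (stoppingToken.IsCancellationRequested) break;
            // With CachedTTSProvider in the chain, this populates the cache.
            await _tts.SynthesizeAsync(phrase);
        }
    }
}

// Program.cs
builder.Services.AddHostedService<TTSWarmupService>();
```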
## Comparison
| Feature | ElevenLabs | Deepgram Aura |
|---|---|---|
| Latency | 300-500ms | 200-350ms |
| Quality | ★★★★★ | ★★★★☆ |
| Voice Cloning | Yes | No |
| Streaming | Yes | Yes |
| Pricing | ~$0.18-0.30 per 1K chars | ~$0.015 per 1K chars |
| Best For | Premium quality | Low latency |
Questions? Drop them in the comments!