DEV Community

Aatif G.
Aatif G.

Posted on

ElevenLabs vs Deepgram Aura: Text-to-Speech Implementation Guide in .NET 8

Introduction

This guide shows you how to implement Text-to-Speech using ElevenLabs and Deepgram Aura in a .NET 8 application, with support for streaming audio and voice customization.

Interface Definition

/// <summary>
/// Abstraction over a Text-to-Speech backend (e.g. ElevenLabs, Deepgram Aura),
/// allowing providers to be swapped via dependency injection.
/// </summary>
public interface ITTSProvider
{
    /// <summary>Synthesizes <paramref name="text"/> and returns the complete audio buffer.</summary>
    Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null);
    /// <summary>Synthesizes <paramref name="text"/>, yielding audio chunks as they arrive.</summary>
    IAsyncEnumerable<byte[]> StreamSynthesizeAsync(string text, VoiceOptions? options = null);
    /// <summary>Lists the voices this provider can synthesize with.</summary>
    Task<List<Voice>> GetAvailableVoicesAsync();
}

/// <summary>Per-request synthesis options; null members fall back to provider defaults.</summary>
// NOTE(review): Speed and Pitch are not consumed by either provider implementation
// shown in this guide — confirm before relying on them.
public record VoiceOptions(
    string? VoiceId = null,
    float? Speed = null,
    float? Pitch = null,
    AudioFormat Format = AudioFormat.PCM16);

/// <summary>Provider-agnostic description of an available voice.</summary>
public record Voice(string Id, string Name, string Description, string? PreviewUrl);

/// <summary>Audio container/encoding requested from the provider.</summary>
public enum AudioFormat
{
    PCM16,
    MP3,
    OGG
}
Enter fullscreen mode Exit fullscreen mode

ElevenLabs Implementation

Setup

dotnet add package ElevenLabs
Enter fullscreen mode Exit fullscreen mode

Configuration

// appsettings.json
{
  "ElevenLabs": {
    "ApiKey": "your-api-key",
    "DefaultVoice": "Rachel",
    "Model": "eleven_turbo_v2"
  }
}
Enter fullscreen mode Exit fullscreen mode

Implementation

using ElevenLabs;
using ElevenLabs.Models;

/// <summary>
/// <see cref="ITTSProvider"/> backed by the ElevenLabs text-to-speech API.
/// </summary>
public class ElevenLabsTTSProvider : ITTSProvider
{
    private readonly ElevenLabsClient _client;
    private readonly string _defaultVoice;
    private readonly string _model;
    private readonly ILogger<ElevenLabsTTSProvider> _logger;

    /// <summary>
    /// Reads "ElevenLabs:ApiKey" (required), "ElevenLabs:DefaultVoice" and
    /// "ElevenLabs:Model" (optional) from configuration.
    /// </summary>
    /// <exception cref="InvalidOperationException">The API key is missing or blank.</exception>
    public ElevenLabsTTSProvider(
        IConfiguration configuration,
        ILogger<ElevenLabsTTSProvider> logger)
    {
        var apiKey = configuration["ElevenLabs:ApiKey"];
        if (string.IsNullOrWhiteSpace(apiKey))
        {
            // Fail fast with a clear message instead of an opaque auth error
            // on the first synthesis request.
            throw new InvalidOperationException(
                "Configuration value 'ElevenLabs:ApiKey' is required.");
        }

        _defaultVoice = configuration["ElevenLabs:DefaultVoice"] ?? "Rachel";
        _model = configuration["ElevenLabs:Model"] ?? "eleven_turbo_v2";
        _client = new ElevenLabsClient(apiKey);
        _logger = logger;
    }

    /// <summary>Synthesizes the full text and returns the encoded audio bytes.</summary>
    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        var voiceId = options?.VoiceId ?? _defaultVoice;

        var request = new TextToSpeechRequest
        {
            Text = text,
            ModelId = _model,
            VoiceSettings = new VoiceSettings
            {
                Stability = 0.5,
                SimilarityBoost = 0.75,
                Style = 0.0,
                UseSpeakerBoost = true
            }
        };

        var outputFormat = MapAudioFormat(options?.Format ?? AudioFormat.PCM16);

        using var audioStream = await _client.TextToSpeech.ConvertAsync(
            voiceId,
            request,
            outputFormat);

        using var memoryStream = new MemoryStream();
        await audioStream.CopyToAsync(memoryStream);

        _logger.LogInformation(
            "Generated {Bytes} bytes of audio for text length {Length}",
            memoryStream.Length,
            text.Length);

        return memoryStream.ToArray();
    }

    /// <summary>Synthesizes the text, yielding audio chunks as they arrive from the API.</summary>
    public async IAsyncEnumerable<byte[]> StreamSynthesizeAsync(
        string text,
        VoiceOptions? options = null)
    {
        var voiceId = options?.VoiceId ?? _defaultVoice;

        var request = new TextToSpeechRequest
        {
            Text = text,
            ModelId = _model,
            VoiceSettings = new VoiceSettings
            {
                Stability = 0.5,
                SimilarityBoost = 0.75
            }
        };

        var outputFormat = MapAudioFormat(options?.Format ?? AudioFormat.PCM16);

        await using var audioStream = await _client.TextToSpeech.ConvertStreamAsync(
            voiceId,
            request,
            outputFormat);

        var buffer = new byte[4096];
        int bytesRead;

        // Memory<byte> overload: modern, cancellable-friendly form of ReadAsync.
        while ((bytesRead = await audioStream.ReadAsync(buffer.AsMemory())) > 0)
        {
            // Each yielded chunk must be its own array — the read buffer is reused.
            var chunk = new byte[bytesRead];
            Array.Copy(buffer, chunk, bytesRead);
            yield return chunk;
        }
    }

    /// <summary>Fetches the account's available voices and maps them to the provider-agnostic shape.</summary>
    public async Task<List<Voice>> GetAvailableVoicesAsync()
    {
        var voices = await _client.Voices.GetAllAsync();

        return voices.Select(v => new Voice(
            Id: v.VoiceId,
            Name: v.Name,
            Description: v.Description ?? string.Empty,
            PreviewUrl: v.PreviewUrl
        )).ToList();
    }

    // Maps the provider-agnostic enum onto ElevenLabs output_format identifiers.
    // Static: uses no instance state (CA1822).
    private static string MapAudioFormat(AudioFormat format)
    {
        return format switch
        {
            AudioFormat.PCM16 => "pcm_16000",
            AudioFormat.MP3 => "mp3_44100_128",
            AudioFormat.OGG => "ogg_opus",
            _ => "pcm_16000"
        };
    }
}
Enter fullscreen mode Exit fullscreen mode

Deepgram Aura Implementation

Setup

dotnet add package Deepgram
Enter fullscreen mode Exit fullscreen mode

Configuration

// appsettings.json
{
  "Deepgram": {
    "ApiKey": "your-api-key",
    "DefaultVoice": "aura-asteria-en"
  }
}
Enter fullscreen mode Exit fullscreen mode

Implementation

using Deepgram;
using Deepgram.Models;

/// <summary>
/// <see cref="ITTSProvider"/> backed by Deepgram's Aura text-to-speech API.
/// </summary>
public class DeepgramTTSProvider : ITTSProvider
{
    private readonly DeepgramClient _client;
    private readonly string _defaultVoice;
    private readonly ILogger<DeepgramTTSProvider> _logger;

    /// <summary>
    /// Reads "Deepgram:ApiKey" (required) and "Deepgram:DefaultVoice" (optional)
    /// from configuration.
    /// </summary>
    /// <exception cref="InvalidOperationException">The API key is missing or blank.</exception>
    public DeepgramTTSProvider(
        IConfiguration configuration,
        ILogger<DeepgramTTSProvider> logger)
    {
        var apiKey = configuration["Deepgram:ApiKey"];
        if (string.IsNullOrWhiteSpace(apiKey))
        {
            // Fail fast instead of deferring the failure to the first request.
            throw new InvalidOperationException(
                "Configuration value 'Deepgram:ApiKey' is required.");
        }

        _defaultVoice = configuration["Deepgram:DefaultVoice"] ?? "aura-asteria-en";
        _client = new DeepgramClient(apiKey);
        _logger = logger;
    }

    /// <summary>Synthesizes the full text and returns the encoded audio bytes.</summary>
    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        var speakOptions = new SpeakOptions
        {
            Model = options?.VoiceId ?? _defaultVoice,
            Encoding = MapAudioFormat(options?.Format ?? AudioFormat.PCM16),
            SampleRate = 16000
        };

        var response = await _client.Speak.v("1").Synthesize(
            new TextSource(text),
            speakOptions);

        _logger.LogInformation(
            "Generated audio for text length {Length}",
            text.Length);

        return response.Data;
    }

    /// <summary>Synthesizes the text, yielding audio chunks as they arrive from the API.</summary>
    public async IAsyncEnumerable<byte[]> StreamSynthesizeAsync(
        string text,
        VoiceOptions? options = null)
    {
        var speakOptions = new SpeakOptions
        {
            Model = options?.VoiceId ?? _defaultVoice,
            Encoding = MapAudioFormat(options?.Format ?? AudioFormat.PCM16),
            SampleRate = 16000
        };

        await using var stream = await _client.Speak.v("1").SynthesizeStream(
            new TextSource(text),
            speakOptions);

        var buffer = new byte[4096];
        int bytesRead;

        // Memory<byte> overload: modern, cancellable-friendly form of ReadAsync.
        while ((bytesRead = await stream.ReadAsync(buffer.AsMemory())) > 0)
        {
            // Each yielded chunk must be its own array — the read buffer is reused.
            var chunk = new byte[bytesRead];
            Array.Copy(buffer, chunk, bytesRead);
            yield return chunk;
        }
    }

    /// <summary>
    /// Returns the static Deepgram Aura voice catalog (as of 2024); Deepgram
    /// does not expose a voice-listing endpoint here, so the list is hard-coded.
    /// </summary>
    public Task<List<Voice>> GetAvailableVoicesAsync()
    {
        // No awaiting needed for a constant list: returning Task.FromResult
        // directly avoids the `return await Task.FromResult(...)` anti-pattern
        // (an async state machine allocated for nothing).
        return Task.FromResult(new List<Voice>
        {
            new Voice("aura-asteria-en", "Asteria", "Female, warm and friendly", null),
            new Voice("aura-luna-en", "Luna", "Female, professional", null),
            new Voice("aura-stella-en", "Stella", "Female, energetic", null),
            new Voice("aura-athena-en", "Athena", "Female, clear", null),
            new Voice("aura-hera-en", "Hera", "Female, authoritative", null),
            new Voice("aura-orion-en", "Orion", "Male, deep", null),
            new Voice("aura-arcas-en", "Arcas", "Male, neutral", null),
            new Voice("aura-perseus-en", "Perseus", "Male, confident", null),
            new Voice("aura-angus-en", "Angus", "Male, friendly", null),
            new Voice("aura-orpheus-en", "Orpheus", "Male, smooth", null),
            new Voice("aura-helios-en", "Helios", "Male, warm", null),
            new Voice("aura-zeus-en", "Zeus", "Male, powerful", null)
        });
    }

    // Maps the provider-agnostic enum onto Deepgram encoding identifiers.
    // Static: uses no instance state (CA1822).
    private static string MapAudioFormat(AudioFormat format)
    {
        return format switch
        {
            AudioFormat.PCM16 => "linear16",
            AudioFormat.MP3 => "mp3",
            AudioFormat.OGG => "opus",
            _ => "linear16"
        };
    }
}
Enter fullscreen mode Exit fullscreen mode

Dependency Injection Setup

// Program.cs — register exactly ONE concrete ITTSProvider; consumers depend
// only on the interface, so swapping providers is a one-line change.
builder.Services.AddSingleton<ITTSProvider, ElevenLabsTTSProvider>();
// OR
builder.Services.AddSingleton<ITTSProvider, DeepgramTTSProvider>();
Enter fullscreen mode Exit fullscreen mode

Usage Examples

Basic Text-to-Speech

/// <summary>
/// HTTP endpoints for one-shot text-to-speech synthesis and voice discovery.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class SpeechController : ControllerBase
{
    private readonly ITTSProvider _tts;

    // The provider must be constructor-injected: without this constructor the
    // readonly field was never assigned and every action would have thrown a
    // NullReferenceException.
    public SpeechController(ITTSProvider tts)
    {
        _tts = tts;
    }

    /// <summary>Synthesizes the posted text and returns it as an MP3 file.</summary>
    [HttpPost("synthesize")]
    public async Task<IActionResult> Synthesize([FromBody] SynthesizeRequest request)
    {
        var options = new VoiceOptions(
            VoiceId: request.VoiceId,
            Format: AudioFormat.MP3);

        var audioData = await _tts.SynthesizeAsync(request.Text, options);

        return File(audioData, "audio/mpeg", "speech.mp3");
    }

    /// <summary>Lists the voices exposed by the configured provider.</summary>
    [HttpGet("voices")]
    public async Task<IActionResult> GetVoices()
    {
        var voices = await _tts.GetAvailableVoicesAsync();
        return Ok(voices);
    }
}

/// <summary>Request body for POST api/speech/synthesize.</summary>
public record SynthesizeRequest(string Text, string? VoiceId);
Enter fullscreen mode Exit fullscreen mode

Streaming Audio via WebSocket

/// <summary>
/// Upgrades the request to a WebSocket and streams synthesized audio chunks
/// to the client as binary messages, closing normally when synthesis finishes.
/// </summary>
[HttpGet("stream")]
public async Task StreamAudio([FromQuery] string text, [FromQuery] string? voiceId)
{
    if (!HttpContext.WebSockets.IsWebSocketRequest)
    {
        HttpContext.Response.StatusCode = 400;
        return;
    }

    using var webSocket = await HttpContext.WebSockets.AcceptWebSocketAsync();

    // Use the request-abort token instead of CancellationToken.None so the
    // loop stops as soon as the client disconnects, rather than pushing the
    // rest of the audio into a dead socket.
    var cancellation = HttpContext.RequestAborted;

    var options = new VoiceOptions(VoiceId: voiceId);

    await foreach (var audioChunk in _tts.StreamSynthesizeAsync(text, options))
    {
        if (cancellation.IsCancellationRequested)
        {
            return;
        }

        await webSocket.SendAsync(
            new ArraySegment<byte>(audioChunk),
            WebSocketMessageType.Binary,
            true,
            cancellation);
    }

    await webSocket.CloseAsync(
        WebSocketCloseStatus.NormalClosure,
        "Completed",
        cancellation);
}
Enter fullscreen mode Exit fullscreen mode

Real-time Conversation with Streaming

/// <summary>
/// Pipes streamed NLP response tokens straight into TTS so audio playback can
/// begin before the full text response has been generated.
/// </summary>
public class ConversationStreamHandler
{
    private readonly INLPProvider _nlp;
    private readonly ITTSProvider _tts;

    // Constructor was missing: the readonly fields were never assigned and
    // ProcessAndSynthesize would have thrown a NullReferenceException.
    public ConversationStreamHandler(INLPProvider nlp, ITTSProvider tts)
    {
        _nlp = nlp;
        _tts = tts;
    }

    /// <summary>Yields audio chunks for the NLP response to <paramref name="userMessage"/> as tokens arrive.</summary>
    public async IAsyncEnumerable<byte[]> ProcessAndSynthesize(string userMessage)
    {
        await foreach (var token in _nlp.StreamResponseAsync(userMessage))
        {
            // NOTE(review): synthesizing each token individually minimizes latency
            // but may sound choppy if tokens are sub-word fragments — consider
            // batching tokens into sentences before synthesis.
            await foreach (var audioChunk in _tts.StreamSynthesizeAsync(token))
            {
                yield return audioChunk;
            }
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Voice Customization

Custom Voice Settings (ElevenLabs)

/// <summary>
/// Demonstrates emotion-flavored synthesis by mapping an emotion label onto
/// ElevenLabs voice settings (stability / similarity / style).
/// </summary>
public class CustomVoiceService
{
    private readonly ElevenLabsClient _client;

    // Constructor was missing: the readonly field was never assigned and
    // SynthesizeWithEmotionAsync would have thrown a NullReferenceException.
    public CustomVoiceService(ElevenLabsClient client)
    {
        _client = client;
    }

    /// <summary>
    /// Synthesizes <paramref name="text"/> with settings tuned for
    /// <paramref name="emotion"/> ("excited", "calm" or "professional";
    /// anything else falls back to default settings).
    /// </summary>
    public async Task<byte[]> SynthesizeWithEmotionAsync(
        string text,
        string emotion)
    {
        // ToLowerInvariant: culture-sensitive ToLower() can mis-map characters
        // (e.g. Turkish dotless I) and silently break the match (CA1304).
        var voiceSettings = emotion.ToLowerInvariant() switch
        {
            "excited" => new VoiceSettings
            {
                Stability = 0.3,
                SimilarityBoost = 0.9,
                Style = 0.8,
                UseSpeakerBoost = true
            },
            "calm" => new VoiceSettings
            {
                Stability = 0.8,
                SimilarityBoost = 0.6,
                Style = 0.2,
                UseSpeakerBoost = false
            },
            "professional" => new VoiceSettings
            {
                Stability = 0.7,
                SimilarityBoost = 0.7,
                Style = 0.0,
                UseSpeakerBoost = true
            },
            _ => new VoiceSettings()
        };

        var request = new TextToSpeechRequest
        {
            Text = text,
            ModelId = "eleven_turbo_v2",
            VoiceSettings = voiceSettings
        };

        // NOTE(review): "Rachel" is passed where the API expects a voice ID —
        // confirm the SDK resolves voice names, or use the actual voice ID.
        using var audioStream = await _client.TextToSpeech.ConvertAsync(
            "Rachel",
            request,
            "pcm_16000");

        using var memoryStream = new MemoryStream();
        await audioStream.CopyToAsync(memoryStream);
        return memoryStream.ToArray();
    }
}
Enter fullscreen mode Exit fullscreen mode

Audio Post-Processing

Normalize Volume

/// <summary>
/// Post-processing helpers for raw 16-bit little-endian PCM audio.
/// </summary>
public class AudioProcessor
{
    /// <summary>
    /// Scales 16-bit PCM samples so the loudest sample sits at
    /// <paramref name="targetLevel"/> of full scale.
    /// </summary>
    /// <param name="audioData">Raw PCM16 bytes (one sample = 2 bytes); any trailing odd byte is ignored.</param>
    /// <param name="targetLevel">Desired peak as a fraction of full scale, e.g. 0.8.</param>
    /// <returns>A new buffer with the scaled samples; the input is not modified.</returns>
    public byte[] NormalizeVolume(byte[] audioData, float targetLevel = 0.8f)
    {
        // Guard empty/too-short input: the Max() call below throws
        // InvalidOperationException on an empty sequence.
        if (audioData.Length < 2)
        {
            return Array.Empty<byte>();
        }

        // Ignore a trailing odd byte — otherwise BlockCopy would try to write
        // past the end of the short[] buffer and throw.
        int byteCount = audioData.Length & ~1;

        var samples = new short[byteCount / 2];
        Buffer.BlockCopy(audioData, 0, samples, 0, byteCount);

        // Find the loudest sample (cast to int so Abs(short.MinValue) cannot overflow).
        float peak = samples.Max(s => Math.Abs((int)s));

        // Pure silence cannot be normalized (scale would be infinite) —
        // return an unmodified copy instead of NaN-filled garbage.
        if (peak == 0f)
        {
            return (byte[])audioData.Clone();
        }

        float scale = (short.MaxValue * targetLevel) / peak;

        // Normalize, clamping in case rounding pushes a sample past full scale.
        for (int i = 0; i < samples.Length; i++)
        {
            samples[i] = (short)Math.Clamp(samples[i] * scale, short.MinValue, short.MaxValue);
        }

        var result = new byte[byteCount];
        Buffer.BlockCopy(samples, 0, result, 0, byteCount);
        return result;
    }
}
Enter fullscreen mode Exit fullscreen mode

Format Conversion

/// <summary>
/// Encodes raw mono 16-bit PCM to MP3 at 128 kbps using NAudio + NAudio.Lame.
/// </summary>
public class AudioConverter
{
    /// <summary>Converts <paramref name="pcmData"/> (mono, 16-bit) to an MP3 byte buffer.</summary>
    public byte[] ConvertPCMToMP3(byte[] pcmData, int sampleRate = 16000)
    {
        using var pcmStream = new MemoryStream(pcmData);
        using var reader = new RawSourceWaveStream(
            pcmStream,
            new WaveFormat(sampleRate, 16, 1));

        using var mp3Stream = new MemoryStream();

        // The LAME writer buffers internally, so it must be disposed (flushed)
        // BEFORE reading the encoded bytes out. The original called ToArray()
        // while the writer was still open, truncating the tail of the MP3 —
        // and for short clips, losing most of it.
        using (var writer = new LameMP3FileWriter(mp3Stream, reader.WaveFormat, 128))
        {
            reader.CopyTo(writer);
        }

        // MemoryStream.ToArray() remains valid even after the stream is closed.
        return mp3Stream.ToArray();
    }
}
Enter fullscreen mode Exit fullscreen mode

Error Handling and Fallback

/// <summary>
/// Decorator that tries a primary <see cref="ITTSProvider"/> and falls back
/// to a secondary provider when the primary fails.
/// </summary>
public class FallbackTTSProvider : ITTSProvider
{
    private readonly ITTSProvider _primary;
    private readonly ITTSProvider _fallback;
    private readonly ILogger<FallbackTTSProvider> _logger;

    // Constructor was missing: the readonly fields were never assigned.
    public FallbackTTSProvider(
        ITTSProvider primary,
        ITTSProvider fallback,
        ILogger<FallbackTTSProvider> logger)
    {
        _primary = primary;
        _fallback = fallback;
        _logger = logger;
    }

    /// <summary>Synthesizes via the primary provider; on any failure, retries on the fallback.</summary>
    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        try
        {
            return await _primary.SynthesizeAsync(text, options);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Primary TTS failed, using fallback");
            return await _fallback.SynthesizeAsync(text, options);
        }
    }

    /// <summary>
    /// Streams from the primary provider, switching to the fallback if the
    /// primary fails before producing any audio.
    /// </summary>
    public async IAsyncEnumerable<byte[]> StreamSynthesizeAsync(
        string text,
        VoiceOptions? options = null)
    {
        // IAsyncEnumerable is lazy: merely CALLING StreamSynthesizeAsync cannot
        // throw, so the original try/catch around the call could never trigger
        // the fallback — errors surface during enumeration. Enumerate manually
        // so failures can be caught. We only fall back while nothing has been
        // yielded yet; once audio has been sent, restarting would duplicate it,
        // so later errors propagate to the caller.
        await using var primary = _primary.StreamSynthesizeAsync(text, options).GetAsyncEnumerator();
        var yieldedAny = false;
        var useFallback = false;

        while (true)
        {
            bool moved;
            byte[]? chunk = null;
            try
            {
                moved = await primary.MoveNextAsync();
                if (moved)
                {
                    chunk = primary.Current;
                }
            }
            catch (Exception ex) when (!yieldedAny)
            {
                _logger.LogWarning(ex, "Primary TTS streaming failed, using fallback");
                useFallback = true;
                break;
            }

            if (!moved)
            {
                break;
            }

            yieldedAny = true;
            yield return chunk!;
        }

        if (useFallback)
        {
            await foreach (var chunk in _fallback.StreamSynthesizeAsync(text, options))
            {
                yield return chunk;
            }
        }
    }

    /// <summary>
    /// Lists voices from the primary provider, falling back on failure.
    /// (Required member: the original class claimed ITTSProvider but did not
    /// implement this method, which would not compile.)
    /// </summary>
    public async Task<List<Voice>> GetAvailableVoicesAsync()
    {
        try
        {
            return await _primary.GetAvailableVoicesAsync();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Primary voice listing failed, using fallback");
            return await _fallback.GetAvailableVoicesAsync();
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Caching for Performance

/// <summary>
/// Decorator that caches complete synthesis results in an
/// <see cref="IDistributedCache"/> for 24 hours. Streaming and voice listing
/// pass straight through to the inner provider.
/// </summary>
public class CachedTTSProvider : ITTSProvider
{
    private readonly ITTSProvider _provider;
    private readonly IDistributedCache _cache;
    private readonly ILogger<CachedTTSProvider> _logger;

    // Constructor was missing: the readonly fields were never assigned.
    public CachedTTSProvider(
        ITTSProvider provider,
        IDistributedCache cache,
        ILogger<CachedTTSProvider> logger)
    {
        _provider = provider;
        _cache = cache;
        _logger = logger;
    }

    /// <summary>Returns cached audio for (text, options) when available; otherwise synthesizes and caches it.</summary>
    public async Task<byte[]> SynthesizeAsync(string text, VoiceOptions? options = null)
    {
        var cacheKey = GenerateCacheKey(text, options);

        var cached = await _cache.GetAsync(cacheKey);
        if (cached != null)
        {
            // Substring(0, 20) threw ArgumentOutOfRangeException for text
            // shorter than 20 chars — clamp the preview length instead.
            var preview = text.Length <= 20 ? text : text[..20];
            _logger.LogInformation("Cache hit for text: {Text}", preview);
            return cached;
        }

        var audio = await _provider.SynthesizeAsync(text, options);

        await _cache.SetAsync(
            cacheKey,
            audio,
            new DistributedCacheEntryOptions
            {
                AbsoluteExpirationRelativeToNow = TimeSpan.FromHours(24)
            });

        return audio;
    }

    /// <summary>
    /// Streaming output is not cached — chunks are consumed as produced.
    /// (Required member: the original class claimed ITTSProvider without it.)
    /// </summary>
    public IAsyncEnumerable<byte[]> StreamSynthesizeAsync(string text, VoiceOptions? options = null)
        => _provider.StreamSynthesizeAsync(text, options);

    /// <summary>Voice listing passes through to the inner provider.</summary>
    public Task<List<Voice>> GetAvailableVoicesAsync()
        => _provider.GetAvailableVoicesAsync();

    // The key includes every option that affects the audio (voice, speed,
    // pitch, format) so two different renditions of the same text never collide.
    private string GenerateCacheKey(string text, VoiceOptions? options)
    {
        var optionsHash = options != null
            ? $"{options.VoiceId}_{options.Speed}_{options.Pitch}_{options.Format}"
            : "default";
        return $"tts:{optionsHash}:{ComputeHash(text)}";
    }

    private string ComputeHash(string text)
    {
        // SHA256.HashData (.NET 5+) avoids allocating and disposing a hasher per call.
        var hash = SHA256.HashData(Encoding.UTF8.GetBytes(text));
        return Convert.ToBase64String(hash);
    }
}
Enter fullscreen mode Exit fullscreen mode

Testing

// NOTE(review): these are integration tests, not unit tests — SynthesizeAsync
// calls the live ElevenLabs API, so "test-key" will fail with an auth error.
// Either use a real key from the environment or mock the HTTP layer.
public class TTSProviderTests
{
    [Fact]
    public async Task Synthesize_ValidText_ReturnsAudio()
    {
        // Arrange: mocked configuration supplies only the API key; DefaultVoice
        // and Model fall back to the provider's built-in defaults.
        var mockConfig = new Mock<IConfiguration>();
        mockConfig.Setup(x => x["ElevenLabs:ApiKey"]).Returns("test-key");

        var provider = new ElevenLabsTTSProvider(
            mockConfig.Object,
            Mock.Of<ILogger<ElevenLabsTTSProvider>>());

        // Act
        var audio = await provider.SynthesizeAsync("Hello world");

        // Assert
        Assert.NotEmpty(audio);
        Assert.True(audio.Length > 1000); // Reasonable audio size
    }

    [Fact]
    public async Task StreamSynthesize_ValidText_ReturnsChunks()
    {
        // Arrange
        // NOTE(review): CreateTestProvider is not defined in this snippet —
        // presumably a factory helper elsewhere in the test project; verify.
        var provider = CreateTestProvider();
        var chunks = new List<byte[]>();

        // Act: drain the async stream into a list so chunk count/size can be asserted.
        await foreach (var chunk in provider.StreamSynthesizeAsync("Hello"))
        {
            chunks.Add(chunk);
        }

        // Assert
        Assert.NotEmpty(chunks);
        Assert.True(chunks.Sum(c => c.Length) > 1000);
    }
}
Enter fullscreen mode Exit fullscreen mode

Performance Benchmarks

Based on production testing:

| Provider | Latency (first chunk) | Quality | Cost per 1K chars |
|---|---|---|---|
| ElevenLabs | 300-500ms | Excellent | $0.18-0.30 |
| Deepgram Aura | 200-350ms | Very Good | $0.015 |
| Azure Neural | 400-600ms | Very Good | $0.015 |
| Google WaveNet | 500-700ms | Excellent | $0.016 |

Best Practices

  1. Stream when possible: Reduces perceived latency by 50%
  2. Cache common phrases: "Hello", "How can I help", etc.
  3. Pre-generate audio: For known responses in menu systems
  4. Normalize volume: Ensure consistent audio levels
  5. Handle errors gracefully: Always have a fallback provider

Comparison

| Feature | ElevenLabs | Deepgram Aura |
|---|---|---|
| Latency | 300-500ms | 200-350ms |
| Quality | ★★★★★ | ★★★★☆ |
| Voice Cloning | Yes | No |
| Streaming | Yes | Yes |
| Pricing | See here | See here |
| Best For | Premium quality | Low latency |

Related Articles

Questions? Drop them in the comments!

Top comments (0)