DEV Community

Aatif G.
Aatif G.

Posted on

Deepgram vs AssemblyAI: Speech-to-Text Implementation in .NET 8

Introduction

This guide shows you how to implement real-time Speech-to-Text using Deepgram and AssemblyAI in a .NET 8 application. Both providers offer excellent accuracy and low latency, but with different strengths.

Deepgram Implementation

Deepgram excels at low-latency streaming transcription, perfect for real-time conversations.

Setup

dotnet add package Deepgram
Enter fullscreen mode Exit fullscreen mode

Configuration

// appsettings.json
{
  "Deepgram": {
    "ApiKey": "your-api-key"
  }
}
Enter fullscreen mode Exit fullscreen mode

Interface Definition

/// <summary>
/// Abstraction over a speech-to-text provider (e.g. Deepgram, AssemblyAI),
/// allowing implementations to be swapped via dependency injection.
/// </summary>
public interface ISTTProvider
{
    /// <summary>Transcribes a complete, pre-recorded audio stream in one call.</summary>
    /// <param name="audioStream">Readable stream containing the audio to transcribe.</param>
    /// <returns>The full transcript text.</returns>
    Task<string> TranscribeAsync(Stream audioStream);

    /// <summary>Transcribes audio incrementally, yielding transcript text as segments are finalized.</summary>
    /// <param name="audioStream">Readable stream containing the audio to transcribe.</param>
    IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream);
}
Enter fullscreen mode Exit fullscreen mode

Implementation

using Deepgram;
using Deepgram.Models;

/// <summary>
/// Speech-to-text provider backed by Deepgram: batch transcription for
/// pre-recorded audio and the live WebSocket API for streaming.
/// </summary>
public class DeepgramSTTProvider : ISTTProvider
{
    private readonly DeepgramClient _client;
    private readonly ILogger<DeepgramSTTProvider> _logger;

    public DeepgramSTTProvider(
        IConfiguration configuration,
        ILogger<DeepgramSTTProvider> logger)
    {
        // Fail fast with a clear message instead of letting the SDK reject a null key later.
        var apiKey = configuration["Deepgram:ApiKey"]
            ?? throw new InvalidOperationException("Missing configuration value 'Deepgram:ApiKey'.");
        _client = new DeepgramClient(apiKey);
        _logger = logger;
    }

    /// <summary>
    /// Transcribes a complete pre-recorded audio stream with Deepgram's batch API.
    /// </summary>
    /// <param name="audioStream">Readable stream containing the audio.</param>
    /// <returns>The transcript of the first channel's best alternative.</returns>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var options = new PrerecordedTranscriptionOptions
        {
            Model = "nova-2",
            Language = "en-US",
            Punctuate = true,
            Diarize = false
        };

        // BUG FIX: the original passed new UrlSource("STREAM"), which asked Deepgram
        // to fetch the literal URL "STREAM" and ignored the caller's audio entirely.
        // Send the stream itself. Content type assumed WAV — TODO(review): confirm
        // or detect from the input.
        var response = await _client.Transcription.Prerecorded.TranscribeAsync(
            new StreamSource(audioStream, "audio/wav"),
            options);

        return response.Results.Channels[0].Alternatives[0].Transcript;
    }

    /// <summary>
    /// Streams audio to Deepgram's live API and yields every finalized transcript segment.
    /// </summary>
    /// <param name="audioStream">Readable stream of raw audio to send in chunks.</param>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var options = new LiveTranscriptionOptions
        {
            Model = "nova-2",
            Language = "en-US",
            Punctuate = true,
            InterimResults = true,
            Endpointing = 300 // ms of silence before Deepgram finalizes a segment
        };

        using var connection = _client.CreateLiveTranscriptionConnection(options);

        // BUG FIX: the original TaskCompletionSource<string> captured only the FIRST
        // final transcript and silently dropped every later segment. Collect all
        // finals and use the TCS purely as a "first result or error" signal.
        // RunContinuationsAsynchronously avoids resuming the awaiter inline on the
        // SDK's event thread.
        var gate = new object();
        var finals = new List<string>();
        var firstResultOrError = new TaskCompletionSource<bool>(
            TaskCreationOptions.RunContinuationsAsynchronously);

        connection.TranscriptReceived += (sender, transcript) =>
        {
            if (transcript.IsFinal)
            {
                var text = transcript.Channel.Alternatives[0].Transcript;
                _logger.LogInformation("Final transcript: {Text}", text);
                lock (gate)
                {
                    finals.Add(text);
                }
                firstResultOrError.TrySetResult(true);
            }
        };

        connection.ConnectionError += (sender, error) =>
        {
            _logger.LogError("Connection error: {Error}", error.Message);
            firstResultOrError.TrySetException(new InvalidOperationException(error.Message));
        };

        await connection.StartConnectionAsync();

        // Stream audio in 8 KB chunks.
        var buffer = new byte[8192];
        int bytesRead;
        while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
        {
            // buffer[..n] copies only the bytes actually read; the original's
            // buffer.Take(n).ToArray() did the same via a per-chunk LINQ walk.
            await connection.SendAsync(buffer[..bytesRead]);
        }

        await connection.FinishAsync();

        // Wait for at least one final segment (or a connection error), mirroring the
        // original behavior. NOTE(review): segments finalized after this point could
        // still be missed — the SDK's connection-closed event would be the airtight
        // signal; confirm what this SDK version exposes.
        await firstResultOrError.Task;

        List<string> snapshot;
        lock (gate)
        {
            snapshot = new List<string>(finals);
        }
        foreach (var text in snapshot)
        {
            yield return text;
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

AssemblyAI Implementation

AssemblyAI offers advanced features like speaker diarization and sentiment analysis.

Setup

dotnet add package AssemblyAI
Enter fullscreen mode Exit fullscreen mode

Implementation

using System.Threading.Channels;

using AssemblyAI;
using AssemblyAI.Transcripts;

/// <summary>
/// Speech-to-text provider backed by AssemblyAI: upload + batch transcription for
/// pre-recorded audio, and the realtime transcriber for streaming.
/// </summary>
public class AssemblyAISTTProvider : ISTTProvider
{
    private readonly AssemblyAIClient _client;
    private readonly ILogger<AssemblyAISTTProvider> _logger;

    public AssemblyAISTTProvider(
        IConfiguration configuration,
        ILogger<AssemblyAISTTProvider> logger)
    {
        // Fail fast with a clear message instead of letting the SDK reject a null key later.
        var apiKey = configuration["AssemblyAI:ApiKey"]
            ?? throw new InvalidOperationException("Missing configuration value 'AssemblyAI:ApiKey'.");
        _client = new AssemblyAIClient(apiKey);
        _logger = logger;
    }

    /// <summary>
    /// Uploads the audio to AssemblyAI, requests a transcript with diarization
    /// enabled, and returns the completed transcript text.
    /// </summary>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        // Upload the raw audio; AssemblyAI transcribes from the returned URL.
        var uploadUrl = await _client.Files.UploadAsync(audioStream);

        // Create transcript
        var transcript = await _client.Transcripts.TranscribeAsync(new TranscriptParams
        {
            AudioUrl = uploadUrl,
            LanguageCode = TranscriptLanguageCode.En,
            PunctuationEnabled = true,
            FormatTextEnabled = true,
            SpeakerLabelsEnabled = true // Diarization
        });

        // Wait for completion. NOTE(review): if TranscribeAsync already polls until
        // ready in this SDK version, this second wait is a harmless no-op — confirm.
        transcript = await _client.Transcripts.WaitUntilReadyAsync(transcript.Id);

        return transcript.Text;
    }

    /// <summary>
    /// Streams audio to AssemblyAI's realtime API, yielding each final transcript
    /// as soon as it arrives.
    /// </summary>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        // BUG FIX: the original collected every final transcript into a List and
        // yielded only after CloseAsync — callers saw nothing until the whole stream
        // ended, defeating the point of streaming. A Channel lets the consumer
        // receive each segment as the SDK raises the event.
        var channel = Channel.CreateUnbounded<string>();

        using var realtime = _client.Realtime.CreateTranscriber();

        realtime.SessionBegins += (sender, e) =>
        {
            _logger.LogInformation("Session started: {SessionId}", e.SessionId);
        };

        realtime.PartialTranscriptReceived += (sender, transcript) =>
        {
            _logger.LogDebug("Partial: {Text}", transcript.Text);
        };

        realtime.FinalTranscriptReceived += (sender, transcript) =>
        {
            _logger.LogInformation("Final: {Text}", transcript.Text);
            channel.Writer.TryWrite(transcript.Text);
        };

        realtime.ErrorReceived += (sender, error) =>
        {
            // Log only, matching the original's behavior of not aborting on SDK errors.
            _logger.LogError("Error: {Error}", error.Error);
        };

        await realtime.ConnectAsync();

        // Pump audio on a background task so transcripts can be consumed concurrently.
        var pump = Task.Run(async () =>
        {
            try
            {
                var buffer = new byte[8192];
                int bytesRead;
                while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
                {
                    await realtime.SendAudioAsync(buffer[..bytesRead]);
                }
                await realtime.CloseAsync();
                channel.Writer.TryComplete();
            }
            catch (Exception ex)
            {
                // Propagate pump failures to the consumer instead of hanging the reader.
                channel.Writer.TryComplete(ex);
            }
        });

        await foreach (var text in channel.Reader.ReadAllAsync())
        {
            yield return text;
        }

        await pump; // surface any send/close exception
    }
}
Enter fullscreen mode Exit fullscreen mode

Dependency Injection Setup

// Program.cs
// Register exactly ONE ISTTProvider here — TranscriptionController takes a single
// ISTTProvider. If you want automatic failover, register BOTH concrete providers
// as ISTTProvider and resolve them through a fallback wrapper that accepts
// IEnumerable<ISTTProvider> (primary first, fallback second).
builder.Services.AddSingleton<ISTTProvider, DeepgramSTTProvider>();
// OR
builder.Services.AddSingleton<ISTTProvider, AssemblyAISTTProvider>();
Enter fullscreen mode Exit fullscreen mode

Usage Example

[ApiController]
[Route("api/[controller]")]
public class TranscriptionController : ControllerBase
{
    private readonly ISTTProvider _stt;

    public TranscriptionController(ISTTProvider stt)
    {
        _stt = stt;
    }

    /// <summary>Transcribes an uploaded audio file in a single batch call.</summary>
    /// <param name="audio">The uploaded audio file.</param>
    [HttpPost("transcribe")]
    public async Task<IActionResult> Transcribe(IFormFile audio)
    {
        // Guard against a missing or empty upload instead of failing inside the provider.
        if (audio is null || audio.Length == 0)
        {
            return BadRequest("An audio file is required.");
        }

        using var stream = audio.OpenReadStream();
        var transcript = await _stt.TranscribeAsync(stream);
        return Ok(new { transcript });
    }

    /// <summary>
    /// Accepts a WebSocket connection, buffers binary audio frames until the client
    /// sends a close frame, then sends transcript segments back as text frames.
    /// </summary>
    [HttpPost("stream")]
    public async Task StreamTranscribe()
    {
        if (!HttpContext.WebSockets.IsWebSocketRequest)
        {
            // BUG FIX: the original returned silently, producing an implicit 200 for
            // plain HTTP requests. Reject misuse explicitly.
            HttpContext.Response.StatusCode = StatusCodes.Status400BadRequest;
            return;
        }

        using var webSocket = await HttpContext.WebSockets.AcceptWebSocketAsync();
        // BUG FIX: the MemoryStream was never disposed.
        using var audioStream = new MemoryStream();

        // Receive audio via WebSocket until the client closes the connection.
        var buffer = new byte[1024 * 4];
        while (webSocket.State == WebSocketState.Open)
        {
            var result = await webSocket.ReceiveAsync(
                new ArraySegment<byte>(buffer),
                CancellationToken.None);

            if (result.MessageType == WebSocketMessageType.Binary)
            {
                await audioStream.WriteAsync(buffer, 0, result.Count);
            }
            else if (result.MessageType == WebSocketMessageType.Close)
            {
                break;
            }
        }

        audioStream.Position = 0;
        await foreach (var transcript in _stt.StreamTranscribeAsync(audioStream))
        {
            var response = Encoding.UTF8.GetBytes(transcript);
            await webSocket.SendAsync(
                new ArraySegment<byte>(response),
                WebSocketMessageType.Text,
                true,
                CancellationToken.None);
        }

        // Complete the close handshake so the client is not left waiting.
        if (webSocket.State == WebSocketState.Open || webSocket.State == WebSocketState.CloseReceived)
        {
            await webSocket.CloseAsync(
                WebSocketCloseStatus.NormalClosure,
                "Transcription complete",
                CancellationToken.None);
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Error Handling with Fallback

/// <summary>
/// Decorator that tries a primary STT provider and falls back to a secondary
/// provider when the primary fails.
/// </summary>
public class FallbackSTTProvider : ISTTProvider
{
    private readonly ISTTProvider _primary;
    private readonly ISTTProvider _fallback;
    private readonly ILogger<FallbackSTTProvider> _logger;

    /// <param name="providers">At least two providers: primary first, fallback second.</param>
    public FallbackSTTProvider(
        IEnumerable<ISTTProvider> providers,
        ILogger<FallbackSTTProvider> logger)
    {
        var providerList = providers.ToList();
        // Fail with a descriptive error instead of an IndexOutOfRangeException.
        if (providerList.Count < 2)
        {
            throw new ArgumentException(
                "FallbackSTTProvider requires at least two registered ISTTProvider instances.",
                nameof(providers));
        }
        _primary = providerList[0];
        _fallback = providerList[1];
        _logger = logger;
    }

    /// <summary>
    /// Transcribes with the primary provider; on failure, rewinds the stream and
    /// retries with the fallback provider.
    /// </summary>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        try
        {
            return await _primary.TranscribeAsync(audioStream);
        }
        catch (Exception ex)
        {
            // A non-seekable stream cannot be replayed for the fallback, so surface
            // the primary failure rather than a misleading NotSupportedException.
            if (!audioStream.CanSeek)
            {
                _logger.LogWarning(ex, "Primary STT failed and stream is not seekable; cannot fall back");
                throw;
            }

            _logger.LogWarning(ex, "Primary STT failed, using fallback");
            audioStream.Position = 0; // Reset stream for replay
            return await _fallback.TranscribeAsync(audioStream);
        }
    }

    /// <summary>
    /// Streams transcripts from the primary provider; if enumeration fails, rewinds
    /// the stream and continues with the fallback provider.
    /// </summary>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        // BUG FIX: IAsyncEnumerable is lazily evaluated. The original try/catch
        // wrapped only the call that BUILDS the enumerable — which never throws —
        // so failures surfaced during the later await-foreach and the fallback could
        // never trigger. Enumerate manually and catch failures from MoveNextAsync.
        var primaryFailed = false;

        await using (var enumerator = _primary.StreamTranscribeAsync(audioStream).GetAsyncEnumerator())
        {
            while (true)
            {
                bool hasNext;
                try
                {
                    hasNext = await enumerator.MoveNextAsync();
                }
                catch (Exception ex)
                {
                    _logger.LogWarning(ex, "Primary STT failed, using fallback");
                    primaryFailed = true;
                    break;
                }

                if (!hasNext)
                {
                    break;
                }

                // yield must sit outside the catch-carrying try block (C# iterators
                // forbid yield inside try/catch).
                yield return enumerator.Current;
            }
        }

        if (primaryFailed)
        {
            if (audioStream.CanSeek)
            {
                audioStream.Position = 0;
            }

            // NOTE(review): segments already yielded by the primary before the
            // failure may be repeated by the fallback's full replay.
            await foreach (var transcript in _fallback.StreamTranscribeAsync(audioStream))
            {
                yield return transcript;
            }
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Testing

public class STTProviderTests
{
    // NOTE(review): this is an integration-style test — TranscribeAsync performs a
    // real network call, so "test-key" and a local test-audio.wav must actually be
    // valid for it to pass. For unit-level coverage, mock ISTTProvider instead and
    // run this only in an integration suite with real credentials.
    [Fact]
    public async Task Transcribe_ValidAudio_ReturnsText()
    {
        // Arrange
        var mockConfig = new Mock<IConfiguration>();
        mockConfig.Setup(x => x["Deepgram:ApiKey"]).Returns("test-key");

        var provider = new DeepgramSTTProvider(
            mockConfig.Object,
            Mock.Of<ILogger<DeepgramSTTProvider>>());

        // BUG FIX: the FileStream was never disposed, leaking the file handle.
        using var audioStream = File.OpenRead("test-audio.wav");

        // Act
        var result = await provider.TranscribeAsync(audioStream);

        // Assert
        Assert.NotEmpty(result);
    }
}
Enter fullscreen mode Exit fullscreen mode

Performance Tips

  1. Use streaming for real-time: StreamTranscribeAsync for live conversations
  2. Batch processing: Use TranscribeAsync for recorded audio
  3. Audio format: PCM16 at 16kHz for best results
  4. Buffer size: 8KB chunks for optimal streaming
  5. Endpointing: 300ms works well for natural pauses

Comparison

| Feature     | Deepgram           | AssemblyAI         |
| ----------- | ------------------ | ------------------ |
| Latency     | 200–400 ms         | 300–500 ms         |
| Accuracy    | Excellent          | Excellent          |
| Diarization | Optional           | Built-in           |
| Pricing     | See Deepgram's pricing page | See AssemblyAI's pricing page |
| Best for    | Speed              | Features           |

Related Articles


Questions? Drop them in the comments!

Top comments (0)