DEV Community

Aatif G.
Aatif G.

Posted on

Deepgram vs AssemblyAI: Speech-to-Text Implementation in .NET 8

Introduction

This guide shows you how to implement real-time Speech-to-Text using Deepgram and AssemblyAI in a .NET 8 application. Both providers offer excellent accuracy and low latency, but with different strengths.

Deepgram Implementation

Deepgram excels at low-latency streaming transcription, perfect for real-time conversations.

Setup

dotnet add package Deepgram
Enter fullscreen mode Exit fullscreen mode

Configuration

// appsettings.json
{
  "Deepgram": {
    "ApiKey": "your-api-key"
  }
}
Enter fullscreen mode Exit fullscreen mode

Interface Definition

/// <summary>
/// Abstraction over a speech-to-text provider (e.g. Deepgram, AssemblyAI),
/// allowing implementations to be swapped via dependency injection.
/// </summary>
public interface ISTTProvider
{
    /// <summary>Transcribes a complete, pre-recorded audio stream in one call.</summary>
    /// <param name="audioStream">Readable stream containing the audio to transcribe.</param>
    /// <returns>The full transcript text.</returns>
    Task<string> TranscribeAsync(Stream audioStream);

    /// <summary>Transcribes audio incrementally, yielding transcript text as segments are finalized.</summary>
    /// <param name="audioStream">Readable stream containing the audio to transcribe.</param>
    IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream);
}
Enter fullscreen mode Exit fullscreen mode

Implementation

using Deepgram;
using Deepgram.Models;

/// <summary>
/// Speech-to-text provider backed by Deepgram: batch transcription for
/// pre-recorded audio and the live WebSocket API for streaming.
/// </summary>
public class DeepgramSTTProvider : ISTTProvider
{
    private readonly DeepgramClient _client;
    private readonly ILogger<DeepgramSTTProvider> _logger;

    public DeepgramSTTProvider(
        IConfiguration configuration,
        ILogger<DeepgramSTTProvider> logger)
    {
        // Fail fast with a clear message instead of letting the SDK reject a null key later.
        var apiKey = configuration["Deepgram:ApiKey"]
            ?? throw new InvalidOperationException("Missing configuration value 'Deepgram:ApiKey'.");
        _client = new DeepgramClient(apiKey);
        _logger = logger;
    }

    /// <summary>
    /// Transcribes a complete pre-recorded audio stream with Deepgram's batch API.
    /// </summary>
    /// <param name="audioStream">Readable stream containing the audio.</param>
    /// <returns>The transcript of the first channel's best alternative.</returns>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var options = new PrerecordedTranscriptionOptions
        {
            Model = "nova-2",
            Language = "en-US",
            Punctuate = true,
            Diarize = false
        };

        // BUG FIX: the original passed new UrlSource("STREAM"), which asked Deepgram
        // to fetch the literal URL "STREAM" and ignored the caller's audio entirely.
        // Send the stream itself. Content type assumed WAV — TODO(review): confirm
        // or detect from the input.
        var response = await _client.Transcription.Prerecorded.TranscribeAsync(
            new StreamSource(audioStream, "audio/wav"),
            options);

        return response.Results.Channels[0].Alternatives[0].Transcript;
    }

    /// <summary>
    /// Streams audio to Deepgram's live API and yields every finalized transcript segment.
    /// </summary>
    /// <param name="audioStream">Readable stream of raw audio to send in chunks.</param>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var options = new LiveTranscriptionOptions
        {
            Model = "nova-2",
            Language = "en-US",
            Punctuate = true,
            InterimResults = true,
            Endpointing = 300 // ms of silence before Deepgram finalizes a segment
        };

        using var connection = _client.CreateLiveTranscriptionConnection(options);

        // BUG FIX: the original TaskCompletionSource<string> captured only the FIRST
        // final transcript and silently dropped every later segment. Collect all
        // finals and use the TCS purely as a "first result or error" signal.
        // RunContinuationsAsynchronously avoids resuming the awaiter inline on the
        // SDK's event thread.
        var gate = new object();
        var finals = new List<string>();
        var firstResultOrError = new TaskCompletionSource<bool>(
            TaskCreationOptions.RunContinuationsAsynchronously);

        connection.TranscriptReceived += (sender, transcript) =>
        {
            if (transcript.IsFinal)
            {
                var text = transcript.Channel.Alternatives[0].Transcript;
                _logger.LogInformation("Final transcript: {Text}", text);
                lock (gate)
                {
                    finals.Add(text);
                }
                firstResultOrError.TrySetResult(true);
            }
        };

        connection.ConnectionError += (sender, error) =>
        {
            _logger.LogError("Connection error: {Error}", error.Message);
            firstResultOrError.TrySetException(new InvalidOperationException(error.Message));
        };

        await connection.StartConnectionAsync();

        // Stream audio in 8 KB chunks.
        var buffer = new byte[8192];
        int bytesRead;
        while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
        {
            // buffer[..n] copies only the bytes actually read; the original's
            // buffer.Take(n).ToArray() did the same via a per-chunk LINQ walk.
            await connection.SendAsync(buffer[..bytesRead]);
        }

        await connection.FinishAsync();

        // Wait for at least one final segment (or a connection error), mirroring the
        // original behavior. NOTE(review): segments finalized after this point could
        // still be missed — the SDK's connection-closed event would be the airtight
        // signal; confirm what this SDK version exposes.
        await firstResultOrError.Task;

        List<string> snapshot;
        lock (gate)
        {
            snapshot = new List<string>(finals);
        }
        foreach (var text in snapshot)
        {
            yield return text;
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

AssemblyAI Implementation

AssemblyAI offers advanced features like speaker diarization and sentiment analysis.

Setup

dotnet add package AssemblyAI
Enter fullscreen mode Exit fullscreen mode

Implementation

using System.Threading.Channels;

using AssemblyAI;
using AssemblyAI.Transcripts;

/// <summary>
/// Speech-to-text provider backed by AssemblyAI: upload + batch transcription for
/// pre-recorded audio, and the realtime transcriber for streaming.
/// </summary>
public class AssemblyAISTTProvider : ISTTProvider
{
    private readonly AssemblyAIClient _client;
    private readonly ILogger<AssemblyAISTTProvider> _logger;

    public AssemblyAISTTProvider(
        IConfiguration configuration,
        ILogger<AssemblyAISTTProvider> logger)
    {
        // Fail fast with a clear message instead of letting the SDK reject a null key later.
        var apiKey = configuration["AssemblyAI:ApiKey"]
            ?? throw new InvalidOperationException("Missing configuration value 'AssemblyAI:ApiKey'.");
        _client = new AssemblyAIClient(apiKey);
        _logger = logger;
    }

    /// <summary>
    /// Uploads the audio to AssemblyAI, requests a transcript with diarization
    /// enabled, and returns the completed transcript text.
    /// </summary>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        // Upload the raw audio; AssemblyAI transcribes from the returned URL.
        var uploadUrl = await _client.Files.UploadAsync(audioStream);

        // Create transcript
        var transcript = await _client.Transcripts.TranscribeAsync(new TranscriptParams
        {
            AudioUrl = uploadUrl,
            LanguageCode = TranscriptLanguageCode.En,
            PunctuationEnabled = true,
            FormatTextEnabled = true,
            SpeakerLabelsEnabled = true // Diarization
        });

        // Wait for completion. NOTE(review): if TranscribeAsync already polls until
        // ready in this SDK version, this second wait is a harmless no-op — confirm.
        transcript = await _client.Transcripts.WaitUntilReadyAsync(transcript.Id);

        return transcript.Text;
    }

    /// <summary>
    /// Streams audio to AssemblyAI's realtime API, yielding each final transcript
    /// as soon as it arrives.
    /// </summary>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        // BUG FIX: the original collected every final transcript into a List and
        // yielded only after CloseAsync — callers saw nothing until the whole stream
        // ended, defeating the point of streaming. A Channel lets the consumer
        // receive each segment as the SDK raises the event.
        var channel = Channel.CreateUnbounded<string>();

        using var realtime = _client.Realtime.CreateTranscriber();

        realtime.SessionBegins += (sender, e) =>
        {
            _logger.LogInformation("Session started: {SessionId}", e.SessionId);
        };

        realtime.PartialTranscriptReceived += (sender, transcript) =>
        {
            _logger.LogDebug("Partial: {Text}", transcript.Text);
        };

        realtime.FinalTranscriptReceived += (sender, transcript) =>
        {
            _logger.LogInformation("Final: {Text}", transcript.Text);
            channel.Writer.TryWrite(transcript.Text);
        };

        realtime.ErrorReceived += (sender, error) =>
        {
            // Log only, matching the original's behavior of not aborting on SDK errors.
            _logger.LogError("Error: {Error}", error.Error);
        };

        await realtime.ConnectAsync();

        // Pump audio on a background task so transcripts can be consumed concurrently.
        var pump = Task.Run(async () =>
        {
            try
            {
                var buffer = new byte[8192];
                int bytesRead;
                while ((bytesRead = await audioStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
                {
                    await realtime.SendAudioAsync(buffer[..bytesRead]);
                }
                await realtime.CloseAsync();
                channel.Writer.TryComplete();
            }
            catch (Exception ex)
            {
                // Propagate pump failures to the consumer instead of hanging the reader.
                channel.Writer.TryComplete(ex);
            }
        });

        await foreach (var text in channel.Reader.ReadAllAsync())
        {
            yield return text;
        }

        await pump; // surface any send/close exception
    }
}
Enter fullscreen mode Exit fullscreen mode

Dependency Injection Setup

// Program.cs
// Register exactly ONE ISTTProvider here — TranscriptionController takes a single
// ISTTProvider. If you want automatic failover, register BOTH concrete providers
// as ISTTProvider and resolve them through a fallback wrapper that accepts
// IEnumerable<ISTTProvider> (primary first, fallback second).
builder.Services.AddSingleton<ISTTProvider, DeepgramSTTProvider>();
// OR
builder.Services.AddSingleton<ISTTProvider, AssemblyAISTTProvider>();
Enter fullscreen mode Exit fullscreen mode

Usage Example

[ApiController]
[Route("api/[controller]")]
public class TranscriptionController : ControllerBase
{
    private readonly ISTTProvider _stt;

    public TranscriptionController(ISTTProvider stt)
    {
        _stt = stt;
    }

    /// <summary>Transcribes an uploaded audio file in a single batch call.</summary>
    /// <param name="audio">The uploaded audio file.</param>
    [HttpPost("transcribe")]
    public async Task<IActionResult> Transcribe(IFormFile audio)
    {
        // Guard against a missing or empty upload instead of failing inside the provider.
        if (audio is null || audio.Length == 0)
        {
            return BadRequest("An audio file is required.");
        }

        using var stream = audio.OpenReadStream();
        var transcript = await _stt.TranscribeAsync(stream);
        return Ok(new { transcript });
    }

    /// <summary>
    /// Accepts a WebSocket connection, buffers binary audio frames until the client
    /// sends a close frame, then sends transcript segments back as text frames.
    /// </summary>
    [HttpPost("stream")]
    public async Task StreamTranscribe()
    {
        if (!HttpContext.WebSockets.IsWebSocketRequest)
        {
            // BUG FIX: the original returned silently, producing an implicit 200 for
            // plain HTTP requests. Reject misuse explicitly.
            HttpContext.Response.StatusCode = StatusCodes.Status400BadRequest;
            return;
        }

        using var webSocket = await HttpContext.WebSockets.AcceptWebSocketAsync();
        // BUG FIX: the MemoryStream was never disposed.
        using var audioStream = new MemoryStream();

        // Receive audio via WebSocket until the client closes the connection.
        var buffer = new byte[1024 * 4];
        while (webSocket.State == WebSocketState.Open)
        {
            var result = await webSocket.ReceiveAsync(
                new ArraySegment<byte>(buffer),
                CancellationToken.None);

            if (result.MessageType == WebSocketMessageType.Binary)
            {
                await audioStream.WriteAsync(buffer, 0, result.Count);
            }
            else if (result.MessageType == WebSocketMessageType.Close)
            {
                break;
            }
        }

        audioStream.Position = 0;
        await foreach (var transcript in _stt.StreamTranscribeAsync(audioStream))
        {
            var response = Encoding.UTF8.GetBytes(transcript);
            await webSocket.SendAsync(
                new ArraySegment<byte>(response),
                WebSocketMessageType.Text,
                true,
                CancellationToken.None);
        }

        // Complete the close handshake so the client is not left waiting.
        if (webSocket.State == WebSocketState.Open || webSocket.State == WebSocketState.CloseReceived)
        {
            await webSocket.CloseAsync(
                WebSocketCloseStatus.NormalClosure,
                "Transcription complete",
                CancellationToken.None);
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Error Handling with Fallback

/// <summary>
/// Decorator that tries a primary STT provider and falls back to a secondary
/// provider when the primary fails.
/// </summary>
public class FallbackSTTProvider : ISTTProvider
{
    private readonly ISTTProvider _primary;
    private readonly ISTTProvider _fallback;
    private readonly ILogger<FallbackSTTProvider> _logger;

    /// <param name="providers">At least two providers: primary first, fallback second.</param>
    public FallbackSTTProvider(
        IEnumerable<ISTTProvider> providers,
        ILogger<FallbackSTTProvider> logger)
    {
        var providerList = providers.ToList();
        // Fail with a descriptive error instead of an IndexOutOfRangeException.
        if (providerList.Count < 2)
        {
            throw new ArgumentException(
                "FallbackSTTProvider requires at least two registered ISTTProvider instances.",
                nameof(providers));
        }
        _primary = providerList[0];
        _fallback = providerList[1];
        _logger = logger;
    }

    /// <summary>
    /// Transcribes with the primary provider; on failure, rewinds the stream and
    /// retries with the fallback provider.
    /// </summary>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        try
        {
            return await _primary.TranscribeAsync(audioStream);
        }
        catch (Exception ex)
        {
            // A non-seekable stream cannot be replayed for the fallback, so surface
            // the primary failure rather than a misleading NotSupportedException.
            if (!audioStream.CanSeek)
            {
                _logger.LogWarning(ex, "Primary STT failed and stream is not seekable; cannot fall back");
                throw;
            }

            _logger.LogWarning(ex, "Primary STT failed, using fallback");
            audioStream.Position = 0; // Reset stream for replay
            return await _fallback.TranscribeAsync(audioStream);
        }
    }

    /// <summary>
    /// Streams transcripts from the primary provider; if enumeration fails, rewinds
    /// the stream and continues with the fallback provider.
    /// </summary>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        // BUG FIX: IAsyncEnumerable is lazily evaluated. The original try/catch
        // wrapped only the call that BUILDS the enumerable — which never throws —
        // so failures surfaced during the later await-foreach and the fallback could
        // never trigger. Enumerate manually and catch failures from MoveNextAsync.
        var primaryFailed = false;

        await using (var enumerator = _primary.StreamTranscribeAsync(audioStream).GetAsyncEnumerator())
        {
            while (true)
            {
                bool hasNext;
                try
                {
                    hasNext = await enumerator.MoveNextAsync();
                }
                catch (Exception ex)
                {
                    _logger.LogWarning(ex, "Primary STT failed, using fallback");
                    primaryFailed = true;
                    break;
                }

                if (!hasNext)
                {
                    break;
                }

                // yield must sit outside the catch-carrying try block (C# iterators
                // forbid yield inside try/catch).
                yield return enumerator.Current;
            }
        }

        if (primaryFailed)
        {
            if (audioStream.CanSeek)
            {
                audioStream.Position = 0;
            }

            // NOTE(review): segments already yielded by the primary before the
            // failure may be repeated by the fallback's full replay.
            await foreach (var transcript in _fallback.StreamTranscribeAsync(audioStream))
            {
                yield return transcript;
            }
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Testing

public class STTProviderTests
{
    // NOTE(review): this is an integration-style test — TranscribeAsync performs a
    // real network call, so "test-key" and a local test-audio.wav must actually be
    // valid for it to pass. For unit-level coverage, mock ISTTProvider instead and
    // run this only in an integration suite with real credentials.
    [Fact]
    public async Task Transcribe_ValidAudio_ReturnsText()
    {
        // Arrange
        var mockConfig = new Mock<IConfiguration>();
        mockConfig.Setup(x => x["Deepgram:ApiKey"]).Returns("test-key");

        var provider = new DeepgramSTTProvider(
            mockConfig.Object,
            Mock.Of<ILogger<DeepgramSTTProvider>>());

        // BUG FIX: the FileStream was never disposed, leaking the file handle.
        using var audioStream = File.OpenRead("test-audio.wav");

        // Act
        var result = await provider.TranscribeAsync(audioStream);

        // Assert
        Assert.NotEmpty(result);
    }
}
Enter fullscreen mode Exit fullscreen mode

Performance Tips

  1. Use streaming for real-time: StreamTranscribeAsync for live conversations
  2. Batch processing: Use TranscribeAsync for recorded audio
  3. Audio format: PCM16 at 16kHz for best results
  4. Buffer size: 8KB chunks for optimal streaming
  5. Endpointing: 300ms works well for natural pauses

Comparison

| Feature     | Deepgram           | AssemblyAI         |
| ----------- | ------------------ | ------------------ |
| Latency     | 200–400 ms         | 300–500 ms         |
| Accuracy    | Excellent          | Excellent          |
| Diarization | Optional           | Built-in           |
| Pricing     | See Deepgram's pricing page | See AssemblyAI's pricing page |
| Best for    | Speed              | Features           |

Related Articles


Questions? Drop them in the comments!

Top comments (0)