Introduction
This guide shows you how to implement real-time Speech-to-Text using Deepgram and AssemblyAI in a .NET 8 application. Both providers offer excellent accuracy and low latency, but with different strengths.
Deepgram Implementation
Deepgram excels at low-latency streaming transcription, perfect for real-time conversations.
Setup
dotnet add package Deepgram
Configuration
// appsettings.json
{
"Deepgram": {
"ApiKey": "your-api-key"
}
}
Interface Definition
/// <summary>
/// Provider-agnostic speech-to-text contract implemented by each STT backend in this guide.
/// </summary>
public interface ISTTProvider
{
/// <summary>Transcribes recorded audio and produces the transcript text.</summary>
/// <param name="audioStream">Stream containing the audio to transcribe.</param>
/// <returns>A task whose result is the transcript text.</returns>
Task<string> TranscribeAsync(Stream audioStream);
/// <summary>Transcribes audio through the provider's live/streaming API.</summary>
/// <param name="audioStream">Stream containing the audio to transcribe.</param>
/// <returns>An asynchronous sequence of transcript strings.</returns>
IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream);
}
Implementation
using Deepgram;
using Deepgram.Models;
/// <summary>
/// <see cref="ISTTProvider"/> implementation backed by the Deepgram SDK.
/// </summary>
/// <remarks>
/// NOTE(review): the Deepgram SDK member names used below (Transcription.Prerecorded.TranscribeAsync,
/// CreateLiveTranscriptionConnection, SendAsync, ...) are kept as they were written; confirm them
/// against the SDK version that is actually installed.
/// </remarks>
public class DeepgramSTTProvider : ISTTProvider
{
    // NOTE(review): ISTTProvider carries no content-type information, so WAV is assumed for
    // pre-recorded audio -- confirm against the callers.
    private const string DefaultMimeType = "audio/wav";

    // Size of each audio chunk pushed to the live connection.
    private const int ChunkSize = 8192;

    private readonly DeepgramClient _client;
    private readonly ILogger<DeepgramSTTProvider> _logger;

    /// <summary>Creates the provider using the API key stored under <c>Deepgram:ApiKey</c>.</summary>
    /// <param name="configuration">Application configuration containing <c>Deepgram:ApiKey</c>.</param>
    /// <param name="logger">Logger used for transcripts and connection errors.</param>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The API key is missing or empty.</exception>
    public DeepgramSTTProvider(
        IConfiguration configuration,
        ILogger<DeepgramSTTProvider> logger)
    {
        ArgumentNullException.ThrowIfNull(configuration);
        ArgumentNullException.ThrowIfNull(logger);

        // Fail fast with a clear message instead of handing a null key to the SDK.
        var apiKey = configuration["Deepgram:ApiKey"];
        if (string.IsNullOrWhiteSpace(apiKey))
        {
            throw new InvalidOperationException(
                "Configuration value 'Deepgram:ApiKey' is missing or empty.");
        }

        _client = new DeepgramClient(apiKey);
        _logger = logger;
    }

    /// <summary>Sends the audio in <paramref name="audioStream"/> to the pre-recorded API.</summary>
    /// <param name="audioStream">Audio to transcribe.</param>
    /// <returns>The transcript of the first alternative of the first channel.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="audioStream"/> is <c>null</c>.</exception>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var options = new PrerecordedTranscriptionOptions
        {
            Model = "nova-2",
            Language = "en-US",
            Punctuate = true,
            Diarize = false
        };

        // Submit the caller's audio itself; the previous code ignored the stream and
        // sent the literal string "STREAM" as a URL.
        var response = await _client.Transcription.Prerecorded.TranscribeAsync(
            new StreamSource(audioStream, DefaultMimeType),
            options);

        return response.Results.Channels[0].Alternatives[0].Transcript;
    }

    /// <summary>
    /// Streams <paramref name="audioStream"/> to a live connection in <see cref="ChunkSize"/>-byte
    /// chunks and yields the final transcripts received for it.
    /// </summary>
    /// <param name="audioStream">Audio to transcribe.</param>
    /// <returns>
    /// The final transcripts that have been received once sending has finished and at least one
    /// final transcript has arrived.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="audioStream"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The connection reported an error before any final transcript arrived.
    /// </exception>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var options = new LiveTranscriptionOptions
        {
            Model = "nova-2",
            Language = "en-US",
            Punctuate = true,
            InterimResults = true,
            Endpointing = 300 // ms of silence before finalizing
        };

        using var connection = _client.CreateLiveTranscriptionConnection(options);

        // Every final transcript is queued; a TaskCompletionSource alone can be completed only
        // once, which silently dropped every final transcript after the first.
        var finals = new System.Collections.Concurrent.ConcurrentQueue<string>();
        var firstFinal = new TaskCompletionSource<bool>(
            TaskCreationOptions.RunContinuationsAsynchronously);

        connection.TranscriptReceived += (sender, transcript) =>
        {
            if (transcript.IsFinal)
            {
                var text = transcript.Channel.Alternatives[0].Transcript;
                _logger.LogInformation("Final transcript: {Text}", text);
                finals.Enqueue(text);
                firstFinal.TrySetResult(true);
            }
        };

        connection.ConnectionError += (sender, error) =>
        {
            _logger.LogError("Connection error: {Error}", error.Message);
            firstFinal.TrySetException(new InvalidOperationException(error.Message));
        };

        await connection.StartConnectionAsync();

        // Stream audio in chunks
        var buffer = new byte[ChunkSize];
        int bytesRead;
        while ((bytesRead = await audioStream.ReadAsync(buffer.AsMemory())) > 0)
        {
            // Send only the bytes actually read; the range copy replaces the per-chunk LINQ pipeline.
            await connection.SendAsync(buffer[..bytesRead]);
        }

        await connection.FinishAsync();

        // NOTE(review): as before, this waits indefinitely if the service never produces a final
        // transcript (e.g. empty audio); no connection-closed signal is used here.
        await firstFinal.Task;

        while (finals.TryDequeue(out var final))
        {
            yield return final;
        }
    }
}
AssemblyAI Implementation
AssemblyAI offers advanced features like speaker diarization and sentiment analysis.
Setup
dotnet add package AssemblyAI
Implementation
using AssemblyAI;
using AssemblyAI.Transcripts;
/// <summary>
/// <see cref="ISTTProvider"/> implementation backed by the AssemblyAI SDK.
/// </summary>
/// <remarks>
/// NOTE(review): the AssemblyAI SDK member names used below are kept as they were written;
/// confirm them against the SDK version that is actually installed.
/// </remarks>
public class AssemblyAISTTProvider : ISTTProvider
{
    // Size of each audio chunk pushed to the realtime transcriber.
    private const int ChunkSize = 8192;

    private readonly AssemblyAIClient _client;
    private readonly ILogger<AssemblyAISTTProvider> _logger;

    /// <summary>Creates the provider using the API key stored under <c>AssemblyAI:ApiKey</c>.</summary>
    /// <param name="configuration">Application configuration containing <c>AssemblyAI:ApiKey</c>.</param>
    /// <param name="logger">Logger used for session, transcript and error events.</param>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The API key is missing or empty.</exception>
    public AssemblyAISTTProvider(
        IConfiguration configuration,
        ILogger<AssemblyAISTTProvider> logger)
    {
        ArgumentNullException.ThrowIfNull(configuration);
        ArgumentNullException.ThrowIfNull(logger);

        // Fail fast with a clear message instead of handing a null key to the SDK.
        var apiKey = configuration["AssemblyAI:ApiKey"];
        if (string.IsNullOrWhiteSpace(apiKey))
        {
            throw new InvalidOperationException(
                "Configuration value 'AssemblyAI:ApiKey' is missing or empty.");
        }

        _client = new AssemblyAIClient(apiKey);
        _logger = logger;
    }

    /// <summary>Uploads the audio, requests a transcript and waits for it to be ready.</summary>
    /// <param name="audioStream">Audio to transcribe.</param>
    /// <returns>The text of the completed transcript.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="audioStream"/> is <c>null</c>.</exception>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        // Upload audio
        var uploadUrl = await _client.Files.UploadAsync(audioStream);

        // Create transcript
        var transcript = await _client.Transcripts.TranscribeAsync(new TranscriptParams
        {
            AudioUrl = uploadUrl,
            LanguageCode = TranscriptLanguageCode.En,
            PunctuationEnabled = true,
            FormatTextEnabled = true,
            SpeakerLabelsEnabled = true // Diarization
        });

        // Wait for completion
        transcript = await _client.Transcripts.WaitUntilReadyAsync(transcript.Id);
        return transcript.Text;
    }

    /// <summary>
    /// Streams <paramref name="audioStream"/> to a realtime transcriber in
    /// <see cref="ChunkSize"/>-byte chunks and, once the session is closed, yields the final
    /// transcripts that were received, in arrival order.
    /// </summary>
    /// <param name="audioStream">Audio to transcribe.</param>
    /// <returns>The final transcripts collected during the session.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="audioStream"/> is <c>null</c>.</exception>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        using var realtime = _client.Realtime.CreateTranscriber();

        // Filled from the SDK's event callbacks and drained below, so a thread-safe queue is
        // used instead of a List<string> that could be modified while it is enumerated.
        var finals = new System.Collections.Concurrent.ConcurrentQueue<string>();

        realtime.SessionBegins += (sender, e) =>
        {
            _logger.LogInformation("Session started: {SessionId}", e.SessionId);
        };

        realtime.PartialTranscriptReceived += (sender, transcript) =>
        {
            _logger.LogDebug("Partial: {Text}", transcript.Text);
        };

        realtime.FinalTranscriptReceived += (sender, transcript) =>
        {
            _logger.LogInformation("Final: {Text}", transcript.Text);
            finals.Enqueue(transcript.Text);
        };

        // Errors are logged only; whatever final transcripts were received are still returned.
        realtime.ErrorReceived += (sender, error) =>
        {
            _logger.LogError("Error: {Error}", error.Error);
        };

        await realtime.ConnectAsync();

        // Stream audio
        var buffer = new byte[ChunkSize];
        int bytesRead;
        while ((bytesRead = await audioStream.ReadAsync(buffer.AsMemory())) > 0)
        {
            // Send only the bytes actually read; the range copy replaces the per-chunk LINQ pipeline.
            await realtime.SendAudioAsync(buffer[..bytesRead]);
        }

        await realtime.CloseAsync();

        while (finals.TryDequeue(out var final))
        {
            yield return final;
        }
    }
}
Dependency Injection Setup
// Program.cs
// The two registrations below are alternatives: pick the provider that should back ISTTProvider.
builder.Services.AddSingleton<ISTTProvider, DeepgramSTTProvider>();
// OR
builder.Services.AddSingleton<ISTTProvider, AssemblyAISTTProvider>();
Usage Example
/// <summary>
/// HTTP and WebSocket endpoints that forward uploaded audio to the configured
/// <see cref="ISTTProvider"/>.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class TranscriptionController : ControllerBase
{
    // Size of the buffer used for each WebSocket receive call.
    private const int ReceiveBufferSize = 1024 * 4;

    private readonly ISTTProvider _stt;

    /// <summary>Creates the controller.</summary>
    /// <param name="stt">Speech-to-text provider used by both endpoints.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stt"/> is <c>null</c>.</exception>
    public TranscriptionController(ISTTProvider stt)
    {
        _stt = stt ?? throw new ArgumentNullException(nameof(stt));
    }

    /// <summary>Transcribes an uploaded audio file.</summary>
    /// <param name="audio">The uploaded audio file.</param>
    /// <returns>
    /// 200 with <c>{ transcript }</c>, or 400 when no file (or an empty file) was uploaded.
    /// </returns>
    [HttpPost("transcribe")]
    public async Task<IActionResult> Transcribe(IFormFile audio)
    {
        if (audio is null || audio.Length == 0)
        {
            return BadRequest("An audio file is required.");
        }

        await using var stream = audio.OpenReadStream();
        var transcript = await _stt.TranscribeAsync(stream);
        return Ok(new { transcript });
    }

    /// <summary>
    /// Accepts a WebSocket, buffers the binary audio frames until the client sends a close
    /// frame, then sends each transcript back as a text message and completes the close handshake.
    /// </summary>
    /// <remarks>
    /// Routed with <c>[Route]</c> rather than <c>[HttpPost]</c>: a WebSocket handshake is a GET
    /// (HTTP/1.1) or CONNECT (HTTP/2) request, so a POST-only action is never reached by
    /// WebSocket clients. POST requests are still routed here as before.
    /// NOTE(review): the audio is buffered in memory without an upper bound; consider enforcing
    /// a size limit because this endpoint receives untrusted input.
    /// </remarks>
    [Route("stream")]
    public async Task StreamTranscribe()
    {
        if (!HttpContext.WebSockets.IsWebSocketRequest)
        {
            // Tell non-WebSocket callers what went wrong instead of returning an empty 200.
            HttpContext.Response.StatusCode = StatusCodes.Status400BadRequest;
            return;
        }

        // Stop receiving/sending when the client disconnects or the server shuts down.
        var cancellationToken = HttpContext.RequestAborted;

        using var webSocket = await HttpContext.WebSockets.AcceptWebSocketAsync();
        using var audioStream = new MemoryStream();

        // Receive audio via WebSocket
        var buffer = new byte[ReceiveBufferSize];
        while (webSocket.State == WebSocketState.Open)
        {
            var result = await webSocket.ReceiveAsync(
                new ArraySegment<byte>(buffer),
                cancellationToken);

            if (result.MessageType == WebSocketMessageType.Close)
            {
                break;
            }

            if (result.MessageType == WebSocketMessageType.Binary)
            {
                await audioStream.WriteAsync(buffer.AsMemory(0, result.Count), cancellationToken);
            }
        }

        audioStream.Position = 0;

        await foreach (var transcript in _stt.StreamTranscribeAsync(audioStream))
        {
            // Sending is only valid while our side of the socket is still open.
            if (!CanSend(webSocket))
            {
                break;
            }

            var response = Encoding.UTF8.GetBytes(transcript);
            await webSocket.SendAsync(
                new ArraySegment<byte>(response),
                WebSocketMessageType.Text,
                true,
                cancellationToken);
        }

        // Complete the close handshake; previously the socket was disposed without it.
        if (CanSend(webSocket))
        {
            await webSocket.CloseAsync(
                WebSocketCloseStatus.NormalClosure,
                "Transcription complete",
                cancellationToken);
        }
    }

    // True while the server may still send frames (the client may already have sent its close frame).
    private static bool CanSend(WebSocket webSocket) =>
        webSocket.State is WebSocketState.Open or WebSocketState.CloseReceived;
}
Error Handling with Fallback
/// <summary>
/// <see cref="ISTTProvider"/> decorator that tries a primary provider and switches to a
/// fallback provider when the primary fails.
/// </summary>
public class FallbackSTTProvider : ISTTProvider
{
    private readonly ISTTProvider _primary;
    private readonly ISTTProvider _fallback;
    private readonly ILogger<FallbackSTTProvider> _logger;

    /// <summary>Creates the provider from the supplied providers.</summary>
    /// <param name="providers">
    /// Providers in priority order: the first is the primary, the second the fallback.
    /// </param>
    /// <param name="logger">Logger used to report primary failures.</param>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">Fewer than two providers were supplied.</exception>
    public FallbackSTTProvider(
        IEnumerable<ISTTProvider> providers,
        ILogger<FallbackSTTProvider> logger)
    {
        ArgumentNullException.ThrowIfNull(providers);
        ArgumentNullException.ThrowIfNull(logger);

        var providerList = providers.ToList();
        if (providerList.Count < 2)
        {
            // Replaces the opaque index-out-of-range failure with an actionable message.
            throw new ArgumentException(
                "At least two STT providers (primary and fallback) must be supplied.",
                nameof(providers));
        }

        _primary = providerList[0];
        _fallback = providerList[1];
        _logger = logger;
    }

    /// <summary>
    /// Transcribes with the primary provider; if it throws and the stream can be rewound,
    /// retries with the fallback provider from the stream's starting position.
    /// </summary>
    /// <param name="audioStream">Audio to transcribe.</param>
    /// <returns>The transcript text from whichever provider succeeded.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="audioStream"/> is <c>null</c>.</exception>
    public async Task<string> TranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        // Remember where the audio starts so the fallback reads exactly what the primary was given.
        var start = audioStream.CanSeek ? audioStream.Position : 0;

        try
        {
            return await _primary.TranscribeAsync(audioStream);
        }
        // Without seek support the audio cannot be replayed; let the primary failure surface
        // instead of masking it with a NotSupportedException from the rewind.
        catch (Exception ex) when (audioStream.CanSeek)
        {
            _logger.LogWarning(ex, "Primary STT failed, using fallback");
            audioStream.Position = start; // Reset stream
            return await _fallback.TranscribeAsync(audioStream);
        }
    }

    /// <summary>
    /// Streams transcripts from the primary provider; if it fails before yielding anything and
    /// the stream can be rewound, streams from the fallback provider instead.
    /// </summary>
    /// <remarks>
    /// A failure after the primary has already yielded transcripts is rethrown, because
    /// restarting on the fallback would repeat transcripts the caller has already received.
    /// </remarks>
    /// <param name="audioStream">Audio to transcribe.</param>
    /// <returns>The transcripts from whichever provider succeeded.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="audioStream"/> is <c>null</c>.</exception>
    public async IAsyncEnumerable<string> StreamTranscribeAsync(Stream audioStream)
    {
        ArgumentNullException.ThrowIfNull(audioStream);

        var start = audioStream.CanSeek ? audioStream.Position : 0;
        Exception? primaryFailure = null;
        var yieldedAny = false;
        IAsyncEnumerator<string>? primary = null;

        try
        {
            while (true)
            {
                string segment;
                try
                {
                    // Async iterators run lazily: failures surface from MoveNextAsync, not from
                    // the StreamTranscribeAsync call itself, so enumeration is what must be guarded.
                    primary ??= _primary.StreamTranscribeAsync(audioStream).GetAsyncEnumerator();
                    if (!await primary.MoveNextAsync())
                    {
                        break;
                    }

                    segment = primary.Current;
                }
                catch (Exception ex) when (!yieldedAny && audioStream.CanSeek)
                {
                    primaryFailure = ex;
                    break;
                }

                // 'yield return' is not allowed inside a try block that has a catch clause,
                // hence the item is yielded here, outside the guarded region.
                yieldedAny = true;
                yield return segment;
            }
        }
        finally
        {
            if (primary is not null)
            {
                await primary.DisposeAsync();
            }
        }

        if (primaryFailure is null)
        {
            yield break;
        }

        _logger.LogWarning(primaryFailure, "Primary STT failed, using fallback");
        audioStream.Position = start;

        await foreach (var transcript in _fallback.StreamTranscribeAsync(audioStream))
        {
            yield return transcript;
        }
    }
}
Testing
/// <summary>Tests for <see cref="DeepgramSTTProvider"/>.</summary>
public class STTProviderTests
{
    /// <summary>
    /// Transcribing the sample file <c>test-audio.wav</c> yields a non-empty transcript.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the provider is constructed with a real client and a placeholder key, so
    /// this presumably needs network access and valid credentials to pass -- confirm.
    /// </remarks>
    [Fact]
    public async Task Transcribe_ValidAudio_ReturnsText()
    {
        // Arrange
        var mockConfig = new Mock<IConfiguration>();
        mockConfig.Setup(x => x["Deepgram:ApiKey"]).Returns("test-key");

        var provider = new DeepgramSTTProvider(
            mockConfig.Object,
            Mock.Of<ILogger<DeepgramSTTProvider>>());

        // Dispose the file handle when the test ends; it was previously left open.
        await using var audioStream = File.OpenRead("test-audio.wav");

        // Act
        var result = await provider.TranscribeAsync(audioStream);

        // Assert
        Assert.NotEmpty(result);
    }
}
Performance Tips
- Use streaming for real-time: `StreamTranscribeAsync` for live conversations
- Batch processing: Use `TranscribeAsync` for recorded audio
- Audio format: PCM16 at 16kHz for best results
- Buffer size: 8KB chunks for optimal streaming
- Endpointing: 300ms works well for natural pauses
Comparison
| Feature | Deepgram | AssemblyAI |
|---|---|---|
| Latency | 200-400ms | 300-500ms |
| Accuracy | Excellent | Excellent |
| Diarization | Optional | Built-in |
| Pricing | See here | See here |
| Best For | Speed | Features |
Related Articles
Questions? Drop them in the comments!
Top comments (0)