Transcription is the gateway to understanding video content at scale. Once you have timestamped text from a video, you can score segments for virality, generate captions, search across a library of content, or build a summarization layer on top. This post covers the complete implementation: FFmpeg audio extraction, file optimization for Whisper, the API call, and timestamp processing.
This transcription pipeline powers the clip selection engine at ClipSpeedAI.
Step 1: Extract Audio With FFmpeg
Sending the full video file to the Whisper API is wasteful — you pay for processing time and upload time for video data that Whisper ignores. Extract audio first.
// lib/audio_extractor.js
import { execa } from 'execa';
import path from 'path';
/**
 * Extract the audio track from a video file using ffmpeg.
 *
 * @param {string} videoPath - path to the source video
 * @param {object} [options]
 * @param {number} [options.sampleRate=16000] - output sample rate in Hz
 * @param {number} [options.channels=1] - output channel count
 * @param {string} [options.bitrate='64k'] - MP3 bitrate (ignored for WAV)
 * @param {string} [options.format='mp3'] - 'mp3' (small) or 'wav' (lossless)
 * @returns {Promise<string>} path of the written audio file
 */
export async function extractAudio(videoPath, options = {}) {
  const {
    sampleRate = 16000, // Whisper resamples to 16kHz internally; more only adds bytes
    channels = 1,       // mono — stereo adds no value for speech recognition
    bitrate = '64k',    // sufficient for speech; WAV is the lossless alternative
    format = 'mp3'      // mp3 for smaller files; wav for maximum accuracy
  } = options;

  const ext = format === 'wav' ? 'wav' : 'mp3';
  // Swap the video extension for `_audio.<ext>`. If the input has no
  // extension the replace() is a no-op, so append the suffix instead —
  // otherwise outputPath === videoPath and ffmpeg would read and
  // overwrite the same file.
  const outputPath = /\.[^.]+$/.test(videoPath)
    ? videoPath.replace(/\.[^.]+$/, `_audio.${ext}`)
    : `${videoPath}_audio.${ext}`;

  const args = [
    '-i', videoPath,
    '-vn', // drop the video stream
    // 'libmp3lame' is ffmpeg's actual MP3 encoder; the bare name 'mp3'
    // only resolves via a codec-descriptor fallback on some builds.
    '-acodec', format === 'wav' ? 'pcm_s16le' : 'libmp3lame',
    '-ar', String(sampleRate),
    '-ac', String(channels)
  ];
  if (format === 'mp3') {
    args.push('-b:a', bitrate); // a bitrate flag is meaningless for PCM/WAV
  }
  args.push('-y', outputPath); // -y: overwrite a stale output from a prior run

  await execa('ffmpeg', args);
  return outputPath;
}
The 16000Hz sample rate matches Whisper's internal processing rate. Sending 44.1kHz audio doesn't improve accuracy and triples the file size. For a 30-minute video:
- 44.1kHz stereo MP3 at 192kbps: ~40MB
- 16kHz mono MP3 at 64kbps: ~14MB
Step 2: File Size Check
Whisper API has a 25MB file size limit. For longer videos (60+ minutes), you may need to split the audio:
// lib/audio_splitter.js
import { execa } from 'execa';
import fs from 'fs/promises';
import path from 'path';
// Stay under Whisper's 25MB upload limit with a safety margin.
const WHISPER_MAX_MB = 24;
const WHISPER_MAX_BYTES = WHISPER_MAX_MB * 1024 * 1024;

/**
 * Ensure an audio file fits under the Whisper API upload limit,
 * splitting it into equal-duration chunks when necessary.
 *
 * @param {string} audioPath - path to the extracted audio file
 * @returns {Promise<string[]|{path: string, startOffset: number}[]>}
 *   `[audioPath]` when no split is needed; otherwise chunk descriptors
 *   carrying each chunk's start offset (seconds) within the original file.
 */
export async function prepareAudioForWhisper(audioPath) {
  const stat = await fs.stat(audioPath);
  if (stat.size <= WHISPER_MAX_BYTES) {
    return [audioPath]; // single file, no split needed
  }

  // Ask ffprobe for the container duration so chunks can be sized evenly.
  const { stdout } = await execa('ffprobe', [
    '-v', 'quiet',
    '-print_format', 'json',
    '-show_format',
    audioPath
  ]);
  const { format } = JSON.parse(stdout);
  const totalDuration = parseFloat(format.duration);
  if (!Number.isFinite(totalDuration) || totalDuration <= 0) {
    // A NaN duration would silently produce NaN chunk boundaries below.
    throw new Error(`ffprobe returned no usable duration for ${audioPath}`);
  }

  const chunks = Math.ceil(stat.size / WHISPER_MAX_BYTES);
  const chunkDuration = totalDuration / chunks;

  const dir = path.dirname(audioPath);
  // Keep the source extension: '-c copy' does not re-encode, and e.g.
  // PCM/WAV audio cannot be remuxed into a hard-coded .mp3 container.
  const ext = path.extname(audioPath);
  const base = path.basename(audioPath, ext);

  const chunkPaths = [];
  for (let i = 0; i < chunks; i++) {
    const start = i * chunkDuration;
    const chunkPath = path.join(dir, `${base}_chunk_${i}${ext}`);
    // '-ss' before '-i' performs fast input-side seeking; '-c copy'
    // avoids a lossy re-encode of each chunk.
    await execa('ffmpeg', [
      '-ss', start.toFixed(2),
      '-i', audioPath,
      '-t', chunkDuration.toFixed(2),
      '-c', 'copy',
      '-y',
      chunkPath
    ]);
    chunkPaths.push({ path: chunkPath, startOffset: start });
  }
  return chunkPaths;
}
Step 3: Transcribe With Whisper API
// lib/transcriber.js
import OpenAI from 'openai';
import fs from 'fs';
// Module-level client; reads OPENAI_API_KEY from the environment once.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Transcribe one audio file with the Whisper API.
 *
 * @param {string} audioPath - audio file to transcribe (must be under the API size limit)
 * @param {object} [options]
 * @param {string} [options.language='en'] - ISO language code
 * @param {string} [options.granularity='word'] - 'word' or 'segment' timestamps
 * @param {string} [options.prompt=''] - context hint for proper nouns
 * @returns {Promise<{text: string, segments?: object[], words?: object[]}>}
 */
export async function transcribeAudioFile(audioPath, options = {}) {
  const {
    language = 'en',
    granularity = 'word', // 'word' or 'segment'
    prompt = ''           // optional context hint for proper nouns
  } = options;

  // Always request segment timestamps as well: downstream cleaning
  // (cleanTranscription) reads response.segments, and the API only
  // guarantees a granularity is present when explicitly requested.
  const granularities =
    granularity === 'word' ? ['word', 'segment'] : ['segment'];

  const response = await openai.audio.transcriptions.create({
    file: fs.createReadStream(audioPath),
    model: 'whisper-1',
    language,
    response_format: 'verbose_json', // required for timestamp data
    timestamp_granularities: granularities,
    prompt // e.g., "ClipSpeedAI, YouTube, Shorts" to help Whisper spell brand names correctly
  });

  return {
    text: response.text,
    segments: response.segments?.map((seg) => ({
      start: seg.start,
      end: seg.end,
      text: seg.text.trim(),
      avgLogprob: seg.avg_logprob // confidence indicator
    })),
    words: response.words?.map((w) => ({
      start: w.start,
      end: w.end,
      word: w.word.trim()
    }))
  };
}
The prompt parameter is underused but valuable. Passing a comma-separated list of technical terms, brand names, or proper nouns helps Whisper spell them correctly rather than phonetically guessing.
Step 4: Handling Multi-Chunk Transcription
When you've split audio into chunks, merge the transcription results with corrected timestamps:
// lib/transcriber.js (continued)
/**
 * Transcribe an audio file, transparently splitting it into chunks when it
 * exceeds the Whisper upload limit and merging the per-chunk results with
 * offset-corrected timestamps.
 *
 * @param {string} audioPath - audio file of any size
 * @returns {Promise<{text: string, segments: object[], words: object[]}>}
 */
export async function transcribeWithChunks(audioPath) {
  const chunks = await prepareAudioForWhisper(audioPath);

  // prepareAudioForWhisper returns [path] (plain string) when no split
  // was needed — nothing to merge or clean up in that case.
  if (typeof chunks[0] === 'string') {
    return transcribeAudioFile(chunks[0]);
  }

  try {
    // Transcribe all chunks in parallel, keeping each chunk's offset.
    const results = await Promise.all(
      chunks.map((chunk) =>
        transcribeAudioFile(chunk.path).then((result) => ({
          result,
          startOffset: chunk.startOffset
        }))
      )
    );

    // Shift every timestamp by its chunk's start offset so all times are
    // relative to the beginning of the full recording.
    const allSegments = [];
    const allWords = [];
    for (const { result, startOffset } of results) {
      if (result.segments) {
        allSegments.push(...result.segments.map((seg) => ({
          ...seg,
          start: seg.start + startOffset,
          end: seg.end + startOffset
        })));
      }
      if (result.words) {
        allWords.push(...result.words.map((w) => ({
          ...w,
          start: w.start + startOffset,
          end: w.end + startOffset
        })));
      }
    }

    return {
      text: results.map((r) => r.result.text).join(' '),
      segments: allSegments,
      words: allWords
    };
  } finally {
    // Chunk files are temporary: remove them even when a transcription
    // call throws, so failed runs don't leak files on disk.
    await Promise.all(
      chunks.map((c) => fs.promises.unlink(c.path).catch(() => {}))
    );
  }
}
Step 5: Post-Processing Transcription Results
Raw Whisper output has some quirks:
- Low-confidence segments (avg_logprob below roughly -1.0; the code below uses -1.2 as the cutoff) tend to be wrong. Filter them if accuracy matters.
- Segments that are just music, applause, or filler (< 0.5 words/second) can be filtered.
- Word-level timestamps sometimes have slight overlaps — deduplicate on word start time.
/**
 * Clean raw Whisper output: drop low-confidence and low-density segments,
 * and deduplicate word timestamps that share a start time.
 *
 * Tolerates word-only transcriptions (`segments` undefined) — the previous
 * version crashed on `segments.filter` in that case.
 *
 * @param {{segments?: object[], words?: object[]}} transcription
 * @returns {object} same shape as the input, with filtered segments/words
 */
export function cleanTranscription(transcription) {
  const { segments, words } = transcription;

  const cleanSegments = segments?.filter((seg) => {
    // Remove very low confidence segments (hallucination-prone).
    if (seg.avgLogprob < -1.2) return false;
    // Remove segments with very low word density (music, applause, silence).
    // Only apply the density test for positive durations — a zero or
    // degenerate duration would make the ratio meaningless.
    const wordCount = seg.text.split(' ').length;
    const duration = seg.end - seg.start;
    if (duration > 0 && wordCount / duration < 0.5) return false;
    return true;
  });

  // Deduplicate words by start time (to 10ms) — Whisper occasionally
  // emits overlapping word timestamps.
  const seenStarts = new Set();
  const cleanWords = words?.filter((w) => {
    const key = w.start.toFixed(2);
    if (seenStarts.has(key)) return false;
    seenStarts.add(key);
    return true;
  });

  return { ...transcription, segments: cleanSegments, words: cleanWords };
}
Full Integration
// Putting it all together
/**
 * End-to-end transcription of a video file: extract the audio track,
 * transcribe it (splitting into chunks automatically when oversized),
 * then clean the raw result. The intermediate audio file is deleted
 * whether or not transcription succeeds.
 *
 * @param {string} videoPath - path to the source video
 * @returns {Promise<object>} cleaned transcription
 */
export async function transcribeVideo(videoPath) {
  const audioPath = await extractAudio(videoPath, {
    sampleRate: 16000,
    channels: 1
  });

  try {
    return cleanTranscription(await transcribeWithChunks(audioPath));
  } finally {
    // Best-effort removal of the temporary audio file.
    await fs.promises.unlink(audioPath).catch(() => {});
  }
}
Caching Transcriptions
Transcription is expensive (~$0.006/minute for Whisper). Cache by video ID:
/**
 * Cached wrapper around transcribeVideo: identical videos are only ever
 * transcribed once within the cache TTL.
 *
 * NOTE(review): assumes a `redis` client is in scope in this module — it
 * is not imported in this snippet; verify against the real file.
 *
 * @param {string} videoId - stable identifier used as the cache key
 * @param {string} videoPath - path to the video, used only on cache miss
 * @returns {Promise<object>} cleaned transcription (from cache or fresh)
 */
export async function transcribeVideoWithCache(videoId, videoPath) {
  // Versioned key: bump `v2` whenever the transcript schema changes.
  const cacheKey = `transcript:v2:${videoId}`;

  const hit = await redis.get(cacheKey);
  if (hit) {
    return JSON.parse(hit);
  }

  const transcript = await transcribeVideo(videoPath);
  const oneWeekSeconds = 7 * 24 * 3600;
  await redis.setex(cacheKey, oneWeekSeconds, JSON.stringify(transcript));
  return transcript;
}
For ClipSpeedAI, transcription caching means that if two users process the same YouTube video, Whisper is only called once. This alone cut transcription costs significantly once any video volume started building up.
The complete transcription pipeline — FFmpeg extraction, chunking, Whisper API, timestamp merging, and cleaning — is what feeds both the clip scoring system and the caption generation system described in other articles in this series. If you want this full pipeline running without building it yourself, ClipSpeedAI provides transcription-driven clip selection and caption generation as a hosted service for YouTube content.
Top comments (0)