Captions are no longer optional for short-form video. Studies consistently show 85%+ of social media videos are watched without sound. If your pipeline produces clips without captions, you're shipping an inferior product. This post covers the full implementation: audio extraction, Whisper transcription, timing alignment, and burning captions directly into the video with FFmpeg.
This is part of the caption stack used by ClipSpeedAI.
The Approach: Hardcoded vs. Soft Captions
Two options exist:
- Soft captions (SRT/VTT): Separate subtitle track, viewer can toggle. Doesn't work reliably on Instagram or TikTok uploads.
- Hardcoded captions (burned in): Rendered into the video pixels permanently. Works everywhere.
For YouTube Shorts, TikTok, and Instagram Reels uploads, hardcoded captions are the only reliable option. That's what we're building.
Step 1: Extract Audio From the Clip
// extract_audio.js
import { execa } from 'execa';
import path from 'path';
/**
 * Extract the audio track from a video clip as a 16kHz mono mp3.
 *
 * @param {string} videoPath - Path to the source video (.mp4/.mov/.webm).
 * @returns {Promise<string>} Path of the mp3 that was written next to the video.
 */
export async function extractAudio(videoPath) {
  // Case-insensitive so ".MP4"/".MOV" uploads work too. If the extension
  // doesn't match at all, appending (instead of replacing) guarantees the
  // audio path can never equal videoPath — the original replace() was a
  // silent no-op in that case and FFmpeg would have clobbered the input.
  const extPattern = /\.(mp4|mov|webm)$/i;
  const audioPath = extPattern.test(videoPath)
    ? videoPath.replace(extPattern, '_audio.mp3')
    : `${videoPath}_audio.mp3`;
  await execa('ffmpeg', [
    '-i', videoPath,
    '-vn',            // drop the video stream
    '-acodec', 'mp3',
    '-ar', '16000',   // Whisper resamples to 16kHz internally anyway
    '-ac', '1',       // mono
    '-b:a', '64k',
    '-y',
    audioPath
  ]);
  return audioPath;
}
Downsampling to 16kHz mono before sending to Whisper reduces file size (faster upload) and matches Whisper's internal 16kHz sample rate, so the downsampling itself costs nothing for transcription accuracy. (The 64k mp3 encode is still technically lossy, but not in a way that affects speech recognition.)
Step 2: Transcribe With Whisper API
// transcribe.js
import OpenAI from 'openai';
import fs from 'fs';
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Transcribe an audio file with the OpenAI Whisper API and return
 * word-level timings.
 *
 * @param {string} audioPath - Path to a local audio file (16kHz mono mp3 works well).
 * @returns {Promise<Array<{word: string, start: number, end: number}>>}
 *   One entry per spoken word; empty array if nothing was transcribed.
 */
export async function transcribeAudio(audioPath) {
  const response = await openai.audio.transcriptions.create({
    file: fs.createReadStream(audioPath),
    model: 'whisper-1',
    response_format: 'verbose_json',   // required for timestamp_granularities
    timestamp_granularities: ['word']  // word-level timestamps
  });
  // Silent or empty audio can come back without a words array; return []
  // so downstream chunking doesn't crash iterating over undefined.
  return response.words ?? [];
}
Word-level granularity is key here. Sentence-level timestamps give you segments that are too long for caption display — you'd be showing 15 words at a time, which is unreadable at normal viewing speed. Word-level lets you group into natural 3-6 word display chunks.
Step 3: Build Caption Chunks
// caption_chunker.js
/**
 * Group word-level timestamps into short display chunks suitable for
 * short-form captions.
 *
 * A chunk is closed as soon as it reaches `maxWords` words or spans at
 * least `maxDuration` seconds; any trailing words are flushed as a final
 * chunk. Text is upper-cased for short-form display.
 *
 * @param {Array<{word: string, start: number, end: number}>} words
 * @param {number} [maxWords=5] - Maximum words per chunk.
 * @param {number} [maxDuration=2.5] - Maximum chunk length in seconds.
 * @returns {Array<{text: string, start: number, end: number}>}
 */
export function buildCaptionChunks(words, maxWords = 5, maxDuration = 2.5) {
  const chunks = [];
  let pending = [];
  let pendingStart = null;

  // Close out the in-progress chunk at the given end time.
  const flush = (endTime) => {
    chunks.push({
      text: pending.join(' ').toUpperCase(),
      start: pendingStart,
      end: endTime
    });
    pending = [];
    pendingStart = null;
  };

  for (const w of words) {
    if (pending.length === 0) {
      pendingStart = w.start;
    }
    pending.push(w.word);
    const spansTooLong = w.end - pendingStart >= maxDuration;
    if (pending.length >= maxWords || spansTooLong) {
      flush(w.end);
    }
  }

  // Flush whatever is left; the tail necessarily ends at the last word's end.
  if (pending.length > 0) {
    flush(words[words.length - 1].end);
  }
  return chunks;
}
The maxWords = 5 and maxDuration = 2.5 values were tuned empirically across hundreds of clips. 5 words at the default font size (5.5% of frame width — about 59px on a 1080px-wide vertical frame) fills roughly 80% of the width.
Step 4: Write an ASS Subtitle File
FFmpeg's drawtext filter works for simple captions, but ASS (Advanced SubStation Alpha) format gives you far better control over fonts, sizing, positioning, and per-word highlighting effects.
// ass_writer.js
import fs from 'fs';
/**
 * Render caption chunks into an ASS (Advanced SubStation Alpha) subtitle file.
 *
 * Writes a complete ASS document (Script Info, one Default style, and one
 * Dialogue event per chunk) to outputPath. Font size scales with frame width
 * (5.5% of videoWidth) and the style's MarginV with frame height (10%).
 *
 * @param {Array<{text: string, start: number, end: number}>} chunks - From buildCaptionChunks; times in seconds.
 * @param {string} outputPath - Destination .ass file (overwritten if present).
 * @param {number} [videoWidth=1080] - Playback resolution width (PlayResX).
 * @param {number} [videoHeight=1920] - Playback resolution height (PlayResY).
 */
export function writeAssFile(chunks, outputPath, videoWidth = 1080, videoHeight = 1920) {
// CAUTION: nothing may be inserted between the backticks below — any added
// line (even a "comment") becomes literal content of the subtitle file.
const header = `[Script Info]
ScriptType: v4.00+
PlayResX: ${videoWidth}
PlayResY: ${videoHeight}
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,${Math.floor(videoWidth * 0.055)},&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,1,2,10,10,${Math.floor(videoHeight * 0.1)},1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
`;
// One Dialogue event per chunk. "{\an5}" (the JS source escapes the backslash)
// is an inline alignment override that centers the line mid-frame.
// NOTE(review): libass ignores MarginV for middle alignments (\an4-6), so this
// likely does NOT produce lower-third placement; {\an2} (bottom-center) would
// respect the style's MarginV — confirm the intended caption position.
const events = chunks.map(chunk => {
const start = formatAssTime(chunk.start);
const end = formatAssTime(chunk.end);
return `Dialogue: 0,${start},${end},Default,,0,0,0,,{\\an5}${chunk.text}`;
}).join('\n');
fs.writeFileSync(outputPath, header + events);
}
/**
 * Format a time in seconds as an ASS timestamp: H:MM:SS.CC
 * (single-digit hour, two-digit minutes/seconds/centiseconds).
 *
 * @param {number} totalSeconds - Non-negative time offset in seconds.
 * @returns {string} e.g. 61.25 -> "0:01:01.25"
 */
function formatAssTime(totalSeconds) {
  const pad = (n) => String(n).padStart(2, '0');
  const whole = Math.floor(totalSeconds);
  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor(whole / 60) % 60;
  const secs = whole % 60;
  // ASS uses centisecond precision; truncate rather than round so a time
  // never ticks over into the next whole second.
  const centis = Math.floor((totalSeconds - whole) * 100);
  return `${hours}:${pad(minutes)}:${pad(secs)}.${pad(centis)}`;
}
The {\an5} tag centers the subtitle both horizontally and vertically in the middle of the frame — and note that libass ignores MarginV for middle alignments. If you want captions in the lower third (which works best for vertical video), use {\an2} (bottom-center) instead, which respects the style's MarginV.
Step 5: Burn Captions With FFmpeg
// caption_burner.js
import { execa } from 'execa';
/**
 * Burn (hardcode) an ASS subtitle file into a video.
 *
 * @param {string} videoPath - Source clip.
 * @param {string} assPath - ASS subtitle file produced by writeAssFile.
 * @param {string} outputPath - Destination mp4 (overwritten if present).
 */
export async function burnCaptions(videoPath, assPath, outputPath) {
  // The -vf value is parsed as an FFmpeg filtergraph, where '\', ':', ',',
  // ';', '[', ']' and quotes are metacharacters. Interpolating the raw path
  // silently breaks on e.g. a Windows drive letter ("C:\...") or a comma in
  // a directory name, so escape it first.
  const escapedAssPath = assPath
    .replace(/\\/g, '/')                // FFmpeg accepts forward slashes on all platforms
    .replace(/([:;,\[\]'])/g, '\\$1');  // escape remaining filtergraph metacharacters
  await execa('ffmpeg', [
    '-i', videoPath,
    '-vf', `ass=${escapedAssPath}`,
    '-c:v', 'libx264',
    '-preset', 'fast',
    '-crf', '22',
    '-c:a', 'copy', // audio was already encoded upstream; copying avoids generational loss
    '-y',
    outputPath
  ]);
}
-c:a copy is important here — you've already encoded the audio once during the clip extraction step. Re-encoding it again introduces generational loss. Copy the audio stream directly.
Full Worker Integration
// caption_worker.js
import { extractAudio } from './extract_audio.js';
import { transcribeAudio } from './transcribe.js';
import { buildCaptionChunks } from './caption_chunker.js';
import { writeAssFile } from './ass_writer.js';
import { burnCaptions } from './caption_burner.js';
import fs from 'fs';
import path from 'path';
/**
 * Full caption pipeline for one clip: extract audio, transcribe with
 * Whisper, chunk the words, write an ASS file, and burn the captions in.
 * Temp files (audio + .ass) are removed even on failure.
 *
 * @param {string} clipPath - Path to the source clip (any extension FFmpeg reads).
 * @returns {Promise<string>} Path of the captioned mp4 written next to the clip.
 */
export async function addCaptionsToClip(clipPath) {
  const dir = path.dirname(clipPath);
  // Strip whatever extension the clip actually has — the old code hard-coded
  // '.mp4', so a '.mov' input produced names like "clip.mov.ass".
  const base = path.basename(clipPath, path.extname(clipPath));
  const assPath = path.join(dir, `${base}.ass`);
  const outputPath = path.join(dir, `${base}_captioned.mp4`);
  // Track the audio path extractAudio actually returns instead of rebuilding
  // it by convention here — the old duplicate construction left the real file
  // orphaned whenever the naming schemes drifted apart.
  let audioPath = null;
  try {
    audioPath = await extractAudio(clipPath);
    const words = await transcribeAudio(audioPath);
    const chunks = buildCaptionChunks(words);
    writeAssFile(chunks, assPath);
    await burnCaptions(clipPath, assPath, outputPath);
    return outputPath;
  } finally {
    // Best-effort cleanup of temp files; ignore missing-file errors.
    for (const f of [audioPath, assPath]) {
      if (f) {
        try { fs.unlinkSync(f); } catch { /* already gone or never created */ }
      }
    }
  }
}
Performance Notes
- Audio extraction takes ~2-5 seconds for a 60-second clip
- Whisper API latency is typically 4-10 seconds for a 60-second audio file
- Caption burning adds ~15-30 seconds to the FFmpeg encode (font rendering overhead)
- Total caption pipeline: ~30-45 seconds per clip
For ClipSpeedAI, caption generation runs in the encode worker alongside the crop and scale step — it's the same FFmpeg pass, so the caption burn doesn't add a separate encode cycle.
Common Pitfalls
- Font not found on server: ASS subtitles fall back silently if the font isn't installed. Use fontconfig, or specify a font directory explicitly: ass=${assPath}:fontsdir=/usr/share/fonts
- Timing drift: If you trim the clip before captioning, Whisper timestamps are relative to clip start — don't offset them, they're already correct.
- Long words overflowing: Add a maxWidth check and break words that exceed it. "Entrepreneurship" in all-caps at ~59px will overflow a 1080px frame.
The full caption pipeline described here is in daily production use at ClipSpeedAI, processing clips for YouTube content repurposed to Shorts, Reels, and TikTok.
Top comments (0)