Captions are no longer optional for short-form video. Studies consistently show 85%+ of social media videos are watched without sound. If your pipeline produces clips without captions, you're shipping an inferior product. This post covers the full implementation: audio extraction, Whisper transcription, timing alignment, and burning captions directly into the video with FFmpeg.
This is part of the caption stack used by ClipSpeedAI.
The Approach: Hardcoded vs. Soft Captions
Two options exist:
- Soft captions (SRT/VTT): Separate subtitle track, viewer can toggle. Doesn't work reliably on Instagram or TikTok uploads.
- Hardcoded captions (burned in): Rendered into the video pixels permanently. Works everywhere.
For YouTube Shorts, TikTok, and Instagram Reels uploads, hardcoded captions are the only reliable option. That's what we're building.
Step 1: Extract Audio From the Clip
// extract_audio.js
import { execa } from 'execa';
import path from 'path';
/**
 * Extract the audio track from a video clip as a 16kHz mono mp3.
 *
 * @param {string} videoPath - Path to the source video (.mp4/.mov/.webm).
 * @returns {Promise<string>} Path of the mp3 that was written next to the video.
 */
export async function extractAudio(videoPath) {
  // Case-insensitive so ".MP4"/".MOV" uploads work too. If the extension
  // doesn't match at all, appending (instead of replacing) guarantees the
  // audio path can never equal videoPath — the original replace() was a
  // silent no-op in that case and FFmpeg would have clobbered the input.
  const extPattern = /\.(mp4|mov|webm)$/i;
  const audioPath = extPattern.test(videoPath)
    ? videoPath.replace(extPattern, '_audio.mp3')
    : `${videoPath}_audio.mp3`;
  await execa('ffmpeg', [
    '-i', videoPath,
    '-vn',            // drop the video stream
    '-acodec', 'mp3',
    '-ar', '16000',   // Whisper resamples to 16kHz internally anyway
    '-ac', '1',       // mono
    '-b:a', '64k',
    '-y',
    audioPath
  ]);
  return audioPath;
}
Downsampling to 16kHz mono before sending to Whisper reduces file size (faster upload) and matches Whisper's internal 16kHz sample rate, so the downsampling itself costs nothing for transcription accuracy. (The 64k mp3 encode is still technically lossy, but not in a way that affects speech recognition.)
Step 2: Transcribe With Whisper API
// transcribe.js
import OpenAI from 'openai';
import fs from 'fs';
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

/**
 * Transcribe an audio file with the OpenAI Whisper API and return
 * word-level timings.
 *
 * @param {string} audioPath - Path to a local audio file (16kHz mono mp3 works well).
 * @returns {Promise<Array<{word: string, start: number, end: number}>>}
 *   One entry per spoken word; empty array if nothing was transcribed.
 */
export async function transcribeAudio(audioPath) {
  const response = await openai.audio.transcriptions.create({
    file: fs.createReadStream(audioPath),
    model: 'whisper-1',
    response_format: 'verbose_json',   // required for timestamp_granularities
    timestamp_granularities: ['word']  // word-level timestamps
  });
  // Silent or empty audio can come back without a words array; return []
  // so downstream chunking doesn't crash iterating over undefined.
  return response.words ?? [];
}
Word-level granularity is key here. Sentence-level timestamps give you segments that are too long for caption display — you'd be showing 15 words at a time, which is unreadable at normal viewing speed. Word-level lets you group into natural 3-6 word display chunks.
Step 3: Build Caption Chunks
// caption_chunker.js
/**
 * Group word-level timestamps into short display chunks suitable for
 * short-form captions.
 *
 * A chunk is closed as soon as it reaches `maxWords` words or spans at
 * least `maxDuration` seconds; any trailing words are flushed as a final
 * chunk. Text is upper-cased for short-form display.
 *
 * @param {Array<{word: string, start: number, end: number}>} words
 * @param {number} [maxWords=5] - Maximum words per chunk.
 * @param {number} [maxDuration=2.5] - Maximum chunk length in seconds.
 * @returns {Array<{text: string, start: number, end: number}>}
 */
export function buildCaptionChunks(words, maxWords = 5, maxDuration = 2.5) {
  const chunks = [];
  let pending = [];
  let pendingStart = null;

  // Close out the in-progress chunk at the given end time.
  const flush = (endTime) => {
    chunks.push({
      text: pending.join(' ').toUpperCase(),
      start: pendingStart,
      end: endTime
    });
    pending = [];
    pendingStart = null;
  };

  for (const w of words) {
    if (pending.length === 0) {
      pendingStart = w.start;
    }
    pending.push(w.word);
    const spansTooLong = w.end - pendingStart >= maxDuration;
    if (pending.length >= maxWords || spansTooLong) {
      flush(w.end);
    }
  }

  // Flush whatever is left; the tail necessarily ends at the last word's end.
  if (pending.length > 0) {
    flush(words[words.length - 1].end);
  }
  return chunks;
}
The maxWords = 5 and maxDuration = 2.5 values were tuned empirically across hundreds of clips. 5 words at the default font size (5.5% of frame width — about 59px on a 1080px-wide vertical frame) fills roughly 80% of the width.
Step 4: Write an ASS Subtitle File
FFmpeg's drawtext filter works for simple captions, but ASS (Advanced SubStation Alpha) format gives you far better control over fonts, sizing, positioning, and per-word highlighting effects.
// ass_writer.js
import fs from 'fs';
/**
 * Render caption chunks into an ASS (Advanced SubStation Alpha) subtitle file.
 *
 * Writes a complete ASS document (Script Info, one Default style, and one
 * Dialogue event per chunk) to outputPath. Font size scales with frame width
 * (5.5% of videoWidth) and the style's MarginV with frame height (10%).
 *
 * @param {Array<{text: string, start: number, end: number}>} chunks - From buildCaptionChunks; times in seconds.
 * @param {string} outputPath - Destination .ass file (overwritten if present).
 * @param {number} [videoWidth=1080] - Playback resolution width (PlayResX).
 * @param {number} [videoHeight=1920] - Playback resolution height (PlayResY).
 */
export function writeAssFile(chunks, outputPath, videoWidth = 1080, videoHeight = 1920) {
// CAUTION: nothing may be inserted between the backticks below — any added
// line (even a "comment") becomes literal content of the subtitle file.
const header = `[Script Info]
ScriptType: v4.00+
PlayResX: ${videoWidth}
PlayResY: ${videoHeight}
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,${Math.floor(videoWidth * 0.055)},&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,1,2,10,10,${Math.floor(videoHeight * 0.1)},1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
`;
// One Dialogue event per chunk. "{\an5}" (the JS source escapes the backslash)
// is an inline alignment override that centers the line mid-frame.
// NOTE(review): libass ignores MarginV for middle alignments (\an4-6), so this
// likely does NOT produce lower-third placement; {\an2} (bottom-center) would
// respect the style's MarginV — confirm the intended caption position.
const events = chunks.map(chunk => {
const start = formatAssTime(chunk.start);
const end = formatAssTime(chunk.end);
return `Dialogue: 0,${start},${end},Default,,0,0,0,,{\\an5}${chunk.text}`;
}).join('\n');
fs.writeFileSync(outputPath, header + events);
}
/**
 * Format a time in seconds as an ASS timestamp: H:MM:SS.CC
 * (single-digit hour, two-digit minutes/seconds/centiseconds).
 *
 * @param {number} totalSeconds - Non-negative time offset in seconds.
 * @returns {string} e.g. 61.25 -> "0:01:01.25"
 */
function formatAssTime(totalSeconds) {
  const pad = (n) => String(n).padStart(2, '0');
  const whole = Math.floor(totalSeconds);
  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor(whole / 60) % 60;
  const secs = whole % 60;
  // ASS uses centisecond precision; truncate rather than round so a time
  // never ticks over into the next whole second.
  const centis = Math.floor((totalSeconds - whole) * 100);
  return `${hours}:${pad(minutes)}:${pad(secs)}.${pad(centis)}`;
}
The {\an5} tag centers the subtitle both horizontally and vertically in the middle of the frame — and note that libass ignores MarginV for middle alignments. If you want captions in the lower third (which works best for vertical video), use {\an2} (bottom-center) instead, which respects the style's MarginV.
Step 5: Burn Captions With FFmpeg
// caption_burner.js
import { execa } from 'execa';
/**
 * Burn (hardcode) an ASS subtitle file into a video.
 *
 * @param {string} videoPath - Source clip.
 * @param {string} assPath - ASS subtitle file produced by writeAssFile.
 * @param {string} outputPath - Destination mp4 (overwritten if present).
 */
export async function burnCaptions(videoPath, assPath, outputPath) {
  // The -vf value is parsed as an FFmpeg filtergraph, where '\', ':', ',',
  // ';', '[', ']' and quotes are metacharacters. Interpolating the raw path
  // silently breaks on e.g. a Windows drive letter ("C:\...") or a comma in
  // a directory name, so escape it first.
  const escapedAssPath = assPath
    .replace(/\\/g, '/')                // FFmpeg accepts forward slashes on all platforms
    .replace(/([:;,\[\]'])/g, '\\$1');  // escape remaining filtergraph metacharacters
  await execa('ffmpeg', [
    '-i', videoPath,
    '-vf', `ass=${escapedAssPath}`,
    '-c:v', 'libx264',
    '-preset', 'fast',
    '-crf', '22',
    '-c:a', 'copy', // audio was already encoded upstream; copying avoids generational loss
    '-y',
    outputPath
  ]);
}
-c:a copy is important here — you've already encoded the audio once during the clip extraction step. Re-encoding it again introduces generational loss. Copy the audio stream directly.
Full Worker Integration
// caption_worker.js
import { extractAudio } from './extract_audio.js';
import { transcribeAudio } from './transcribe.js';
import { buildCaptionChunks } from './caption_chunker.js';
import { writeAssFile } from './ass_writer.js';
import { burnCaptions } from './caption_burner.js';
import fs from 'fs';
import path from 'path';
/**
 * Full caption pipeline for one clip: extract audio, transcribe with
 * Whisper, chunk the words, write an ASS file, and burn the captions in.
 * Temp files (audio + .ass) are removed even on failure.
 *
 * @param {string} clipPath - Path to the source clip (any extension FFmpeg reads).
 * @returns {Promise<string>} Path of the captioned mp4 written next to the clip.
 */
export async function addCaptionsToClip(clipPath) {
  const dir = path.dirname(clipPath);
  // Strip whatever extension the clip actually has — the old code hard-coded
  // '.mp4', so a '.mov' input produced names like "clip.mov.ass".
  const base = path.basename(clipPath, path.extname(clipPath));
  const assPath = path.join(dir, `${base}.ass`);
  const outputPath = path.join(dir, `${base}_captioned.mp4`);
  // Track the audio path extractAudio actually returns instead of rebuilding
  // it by convention here — the old duplicate construction left the real file
  // orphaned whenever the naming schemes drifted apart.
  let audioPath = null;
  try {
    audioPath = await extractAudio(clipPath);
    const words = await transcribeAudio(audioPath);
    const chunks = buildCaptionChunks(words);
    writeAssFile(chunks, assPath);
    await burnCaptions(clipPath, assPath, outputPath);
    return outputPath;
  } finally {
    // Best-effort cleanup of temp files; ignore missing-file errors.
    for (const f of [audioPath, assPath]) {
      if (f) {
        try { fs.unlinkSync(f); } catch { /* already gone or never created */ }
      }
    }
  }
}
Performance Notes
- Audio extraction takes ~2-5 seconds for a 60-second clip
- Whisper API latency is typically 4-10 seconds for a 60-second audio file
- Caption burning adds ~15-30 seconds to the FFmpeg encode (font rendering overhead)
- Total caption pipeline: ~30-45 seconds per clip
For ClipSpeedAI, caption generation runs in the encode worker alongside the crop and scale step — it's the same FFmpeg pass, so the caption burn doesn't add a separate encode cycle.
Common Pitfalls
- Font not found on server: ASS subtitles fall back silently if the font isn't installed. Use fontconfig, or specify a font directory explicitly: ass=${assPath}:fontsdir=/usr/share/fonts
- Timing drift: If you trim the clip before captioning, Whisper timestamps are relative to clip start — don't offset them, they're already correct.
- Long words overflowing: Add a maxWidth check and break words that exceed it. "Entrepreneurship" in all-caps at ~59px will overflow a 1080px frame.
The full caption pipeline described here is in daily production use at ClipSpeedAI, processing clips for YouTube content repurposed to Shorts, Reels, and TikTok.
Top comments (0)