Building a Video Thumbnail Generator With FFmpeg and Node.js

#node #ffmpeg #javascript #video

Thumbnails drive click-through rates on YouTube, and programmatic thumbnail generation is a solved problem with FFmpeg — but the implementation details matter enormously. Extracting the wrong frame (a blink, a dark transition, a blurry motion shot) is worse than a static placeholder. This post covers smart frame selection, multi-frame extraction, quality scoring, and the full Node.js implementation.

This thumbnail pipeline is part of the processing stack at ClipSpeedAI, where thumbnails are generated for every processed clip.

Basic Frame Extraction

The simplest approach: extract a frame at a specific timestamp.

// lib/thumbnails/basic.js
import { execa } from 'execa';

/**
 * Write a single JPEG frame taken at `timestamp` from `videoPath` to `outputPath`.
 *
 * @param {string} videoPath - Input video file.
 * @param {string} outputPath - Destination image path (overwritten because of `-y`).
 * @param {number|string} timestamp - Seek position, stringified and handed to `-ss`.
 * @returns {Promise<string>} Resolves with `outputPath` once ffmpeg has finished.
 */
export async function extractFrameAtTime(videoPath, outputPath, timestamp) {
  // -ss goes in front of -i so it acts as an input seek.
  const seekArgs = ['-ss', String(timestamp)];
  const inputArgs = ['-i', videoPath];
  // One video frame; -q:v is the JPEG quality (2 = high, 31 = low).
  const outputArgs = ['-frames:v', '1', '-q:v', '2', '-y', outputPath];

  await execa('ffmpeg', [...seekArgs, ...inputArgs, ...outputArgs]);

  return outputPath;
}

The position of the -ss flag is critical. Placed before -i (input seeking), it makes FFmpeg jump to the nearest keyframe ahead of the timestamp and decode only from there, instead of decoding and discarding every prior frame — this is 10-50x faster for long videos.

Multi-Frame Candidate Extraction

Better approach: extract multiple candidate frames spread across the video and score them.

// lib/thumbnails/candidates.js
import { execa } from 'execa';
import path from 'path';
import { promisify } from 'util';
import ffmpeg from 'fluent-ffmpeg';

// Promise-returning wrapper around fluent-ffmpeg's callback-style `ffprobe`,
// bound so the call keeps `ffmpeg` as its receiver.
const ffprobe = promisify(ffmpeg.ffprobe.bind(ffmpeg));

/**
 * Extract `count` evenly spaced candidate frames from the middle 90% of a video.
 *
 * Frames are written one after another to `outputDir` as
 * `thumb_candidate_<index>.jpg`.
 *
 * @param {string} videoPath - Input video file.
 * @param {string} outputDir - Directory that receives the candidate JPEGs.
 * @param {number} [count=10] - Number of frames to extract.
 * @returns {Promise<Array<{timestamp: number, path: string, index: number}>>}
 *   One entry per extracted frame, in timestamp order.
 * @throws {Error} If the probed duration is not a positive, finite number.
 */
export async function extractCandidateFrames(videoPath, outputDir, count = 10) {
  const probe = await ffprobe(videoPath);

  // The probe can report a missing or non-numeric duration (e.g. "N/A").
  // Fail here with a clear message instead of spawning ffmpeg with "-ss NaN".
  const duration = Number(probe?.format?.duration);
  if (!Number.isFinite(duration) || duration <= 0) {
    throw new Error(
      `Cannot extract candidate frames: unusable duration (${String(probe?.format?.duration)}) for "${videoPath}"`
    );
  }

  // Avoid first and last 5% (likely intro/outro)
  const safeStart = duration * 0.05;
  const safeEnd = duration * 0.95;
  const interval = (safeEnd - safeStart) / count;

  const frames = [];

  for (let i = 0; i < count; i++) {
    const timestamp = safeStart + i * interval;
    const outputPath = path.join(outputDir, `thumb_candidate_${i}.jpg`);

    // -ss before -i: input seeking, so earlier frames are not all decoded.
    await execa('ffmpeg', [
      '-ss', timestamp.toFixed(2),
      '-i', videoPath,
      '-frames:v', '1',
      '-q:v', '2',
      '-y',
      outputPath
    ]);

    frames.push({ timestamp, path: outputPath, index: i });
  }

  return frames;
}

Scene Change Detection for Smart Timestamps

Rather than equally-spaced frames, use FFmpeg's scene change detection to find frames that are visually distinct (not blurry transitions):

/**
 * Find timestamps (in seconds) of scene changes using ffmpeg's `scene` score.
 *
 * Frames whose scene score exceeds `threshold` are passed to `showinfo`, and
 * the `pts_time` values it logs are collected.
 *
 * @param {string} videoPath - Input video file.
 * @param {number} [threshold=0.3] - Scene score a frame must exceed to count.
 * @param {number} [maxScenes=15] - Maximum number of timestamps to return.
 * @returns {Promise<number[]>} Timestamps in the order ffmpeg reported them.
 */
export async function getSceneChangeTimestamps(videoPath, threshold = 0.3, maxScenes = 15) {
  // showinfo reports through ffmpeg's log (stderr), and the null muxer writes
  // nothing to stdout, so parse the combined `all` stream rather than `stdout`.
  const { all, stderr } = await execa('ffmpeg', [
    '-i', videoPath,
    '-vf', `select='gt(scene,${threshold})',showinfo`,
    '-vsync', 'vfr',
    '-f', 'null',
    '-'
  ], { all: true });

  const log = all ?? stderr ?? '';

  // Parse showinfo output for pts_time values
  const timestamps = [];
  for (const match of log.matchAll(/pts_time:([\d.]+)/g)) {
    if (timestamps.length >= maxScenes) break;
    timestamps.push(Number.parseFloat(match[1]));
  }

  return timestamps;
}

Scene changes are exactly what you want for thumbnails — they're moments where the visual content shifted significantly, which often correlates with interesting content.

Frame Quality Scoring

Once you have candidate frames, score them by sharpness (Laplacian variance — higher = sharper) and flag frames that are too dark or overexposed as unusable:

# python/score_frame.py
import sys
import cv2
import json
import os

def score_frame_sharpness(image_path):
    """Score one image for sharpness and brightness.

    Sharpness is the variance of the Laplacian of the grayscale image
    (higher = sharper); brightness is the mean grayscale value.

    Args:
        image_path: Path of the image file to score.

    Returns:
        A dict with keys ``sharpness`` (float), ``brightness`` (float),
        ``usable`` (bool) and ``path`` (the input path). An image that cannot
        be read gets zero scores and ``usable`` False, so every result has the
        same shape.
    """
    img = cv2.imread(image_path)
    if img is None:
        # Return the same record shape as a successful score so consumers of
        # the JSON output never have to special-case a bare number.
        return {
            'sharpness': 0.0,
            'brightness': 0.0,
            'usable': False,
            'path': image_path
        }

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Convert NumPy scalars to plain floats up front: comparisons on NumPy
    # scalars yield numpy.bool_, which json.dumps refuses to serialize.
    laplacian_var = float(cv2.Laplacian(gray, cv2.CV_64F).var())

    # Also check brightness (avoid very dark or overexposed frames)
    mean_brightness = float(gray.mean())
    brightness_ok = 40 < mean_brightness < 220

    return {
        'sharpness': laplacian_var,
        'brightness': mean_brightness,
        'usable': bool(brightness_ok and laplacian_var > 50),
        'path': image_path
    }

if __name__ == '__main__':
    # Score every image path given on the command line and print the
    # results as a single JSON array on stdout.
    frame_scores = []
    for frame_path in sys.argv[1:]:
        frame_scores.append(score_frame_sharpness(frame_path))
    print(json.dumps(frame_scores))

// lib/thumbnails/scorer.js
import { execa } from 'execa';

/**
 * Run the Python scoring script over the given frames and return its parsed output.
 *
 * @param {string[]} framePaths - Image paths passed to the script as arguments.
 * @returns {Promise<*>} Whatever JSON value the script printed on stdout.
 */
export async function scoreFrames(framePaths) {
  const scriptArgs = ['./python/score_frame.py', ...framePaths];
  const result = await execa('python3', scriptArgs);
  const parsedScores = JSON.parse(result.stdout);
  return parsedScores;
}

The Full Pipeline

// lib/thumbnails/generator.js
import { execa } from 'execa';
import fs from 'fs/promises';
import path from 'path';
import { extractCandidateFrames, getSceneChangeTimestamps, extractFrameAtTime } from './candidates.js';
import { scoreFrames } from './scorer.js';

/**
 * Generate `thumbnail.jpg` in `outputDir` from the best-scoring frame of a video.
 *
 * Candidate frames come from scene-change timestamps when at least three are
 * found, otherwise from evenly spaced extraction. The sharpest frame flagged
 * `usable` wins, falling back to the sharpest frame overall. The winner is
 * scaled to fit the target size and padded to exactly that size.
 *
 * @param {string} videoPath - Input video file.
 * @param {string} outputDir - Working/output directory (created if missing).
 * @param {object} [options]
 * @param {boolean} [options.useSceneDetection=true] - Try scene-change timestamps first.
 * @param {number} [options.candidateCount=10] - Maximum number of candidate frames.
 * @param {number} [options.outputWidth=1280] - Final thumbnail width.
 * @param {number} [options.outputHeight=720] - Final thumbnail height.
 * @returns {Promise<string>} Path of the generated thumbnail.
 * @throws {Error} If no candidate frame could be extracted or scored.
 */
export async function generateBestThumbnail(videoPath, outputDir, options = {}) {
  const {
    useSceneDetection = true,
    candidateCount = 10,
    outputWidth = 1280,
    outputHeight = 720
  } = options;

  await fs.mkdir(outputDir, { recursive: true });

  // Get candidate timestamps; null means "use evenly spaced frames".
  let timestamps = null;
  if (useSceneDetection) {
    const sceneTimestamps = await getSceneChangeTimestamps(videoPath);
    // Fallback to evenly-spaced if few scene changes
    if (sceneTimestamps.length >= 3) {
      timestamps = sceneTimestamps;
    }
  }

  let candidates = [];
  try {
    if (timestamps) {
      // Record the target paths before extracting so the cleanup below also
      // removes frames written by a partially failed batch.
      candidates = timestamps.slice(0, candidateCount).map((timestamp, i) => ({
        timestamp,
        path: path.join(outputDir, `cand_${i}.jpg`)
      }));

      // Wait for every extraction to settle so no ffmpeg process is still
      // writing a file while cleanup runs.
      const outcomes = await Promise.allSettled(
        candidates.map(c => extractFrameAtTime(videoPath, c.path, c.timestamp))
      );
      const failed = outcomes.find(o => o.status === 'rejected');
      if (failed) {
        throw failed.reason;
      }
    } else {
      candidates = await extractCandidateFrames(videoPath, outputDir, candidateCount);
    }

    if (candidates.length === 0) {
      throw new Error(`No candidate frames were extracted from "${videoPath}"`);
    }

    // Score all candidates, ignoring any entry that is not a score record.
    const scores = await scoreFrames(candidates.map(c => c.path));
    const ranked = scores
      .filter(s => s !== null && typeof s === 'object')
      .sort((a, b) => b.sharpness - a.sharpness);

    // Sharpest usable frame, else sharpest frame overall.
    const best = ranked.find(s => s.usable) ?? ranked[0];
    if (!best) {
      throw new Error(`No candidate frame of "${videoPath}" could be scored`);
    }

    // Resize to target dimensions: fit inside the box, then pad to center.
    const finalPath = path.join(outputDir, 'thumbnail.jpg');

    await execa('ffmpeg', [
      '-i', best.path,
      '-vf', `scale=${outputWidth}:${outputHeight}:force_original_aspect_ratio=decrease,pad=${outputWidth}:${outputHeight}:(ow-iw)/2:(oh-ih)/2`,
      '-q:v', '3',
      '-y',
      finalPath
    ]);

    return finalPath;
  } finally {
    // Cleanup candidates on success and failure alike. Deliberately
    // best-effort: a frame that was never written must not mask the result.
    await Promise.all(
      candidates.map(c => fs.unlink(c.path).catch(() => {}))
    );
  }
}

Generating Thumbnails for Vertical Video

For Shorts/Reels thumbnails (vertical 9:16), adjust the dimensions:

// Portrait 1080x1920 target for Shorts/Reels, with scene detection enabled.
const verticalThumbnailOptions = {
  useSceneDetection: true,
  outputWidth: 1080,
  outputHeight: 1920
};

const thumbnail = await generateBestThumbnail(clipPath, tmpDir, verticalThumbnailOptions);

The pad filter with (ow-iw)/2:(oh-ih)/2 centers the video in the target frame with black bars rather than distorting it.

Adding Text Overlay

For branded thumbnails with title text:

/**
 * Escape arbitrary text for use as a filter option value inside a filtergraph.
 *
 * Two levels are applied: option-value escaping (backslash before `\`, `'`
 * and `:`), then single-quoting for the filtergraph parser, where a literal
 * quote has to be written by closing the quote, adding `\'`, and reopening.
 */
function escapeDrawtextValue(text) {
  const optionEscaped = String(text).replace(/[\\':]/g, '\\$&');
  return `'${optionEscaped.replace(/'/g, "'\\''")}'`;
}

/**
 * Draw a semi-transparent bottom bar and a centered title onto an image.
 *
 * @param {string} imagePath - Source image.
 * @param {string} title - Text to render; it is escaped and drawn verbatim.
 * @param {string} outputPath - Destination image path (overwritten because of `-y`).
 * @returns {Promise<void>} Resolves once ffmpeg has finished.
 */
export async function addTitleOverlay(imagePath, title, outputPath) {
  const escapedTitle = escapeDrawtextValue(title);

  await execa('ffmpeg', [
    '-i', imagePath,
    '-vf', [
      // Semi-transparent bottom bar
      `drawbox=x=0:y=ih-120:w=iw:h=120:color=black@0.6:t=fill`,
      // Title text; expansion=none keeps `%{...}` and `\` in titles literal
      `drawtext=text=${escapedTitle}:expansion=none:fontsize=36:fontcolor=white:x=(w-text_w)/2:y=h-80:font=Arial`
    ].join(','),
    '-q:v', '2',
    '-y',
    outputPath
  ]);
}

Integration With the Processing Pipeline

In the clip processing worker, thumbnail generation runs in parallel with the upload step:

// Start the clip upload and the thumbnail generation together, then wait for both.
const clipUploadTask = uploadClipToStorage(encodedClipPath);
const thumbnailTask = generateBestThumbnail(encodedClipPath, tmpDir);
const [uploadedUrl, thumbnailPath] = await Promise.all([clipUploadTask, thumbnailTask]);

// The thumbnail can only be uploaded once it has been generated.
const thumbnailUrl = await uploadThumbnailToStorage(thumbnailPath);

ClipSpeedAI generates thumbnails for every processed clip automatically. Users can download both the clip and its thumbnail, which is useful when scheduling YouTube Shorts uploads where custom thumbnails are supported.

The key insight: "smart" thumbnail selection (scene detection + sharpness scoring) consistently produces better results than random or center-frame selection. The 15 minutes of development to add frame scoring pays off across every video processed. For a complete hosted pipeline that handles thumbnails, captions, and vertical clip generation together, see ClipSpeedAI.