Video Content Search with Captions
Problem
Search video content and jump directly to the point in the video where the search term is mentioned.
Solution
The search query is matched against each video's title, description, and captions.
When a caption matches, playback jumps to that caption's timestamp; otherwise, it starts from the beginning of the video.
Built with Django and OpenSearch.
Implementation
Step 1: Define Models
Models for storing media and subtitles.
class Media(Model):
    """A media item that can be found through search by title or description."""

    # Short display title.
    title = CharField(
        verbose_name=_("Title"),
        max_length=255,
    )
    # Optional free-form description; stored as an empty string when omitted.
    description = TextField(
        verbose_name=_("Description"),
        blank=True,
        default="",
    )
class Subtitle(Model):
    """A subtitle track attached to a single ``Media`` item."""

    # Owning media item; the subtitle is deleted together with it (CASCADE).
    media = ForeignKey(
        to=Media,
        on_delete=CASCADE,
        verbose_name=_("Media"),
    )
    # Language identifier for this track (at most 10 characters).
    lang = CharField(
        verbose_name=_("Language"),
        max_length=10,
    )
    # Full subtitle text.
    body = TextField(verbose_name=_("Body"))  # WebVTT format
Step 2: OpenSearch Document Indexing
Each caption line is indexed automatically, together with its timestamp, whenever a subtitle is saved.
A NestedField stores each caption line alongside its start time.
@registry.register_document
class SubtitleDocument(Document):
    """Search document for a ``Subtitle``: one nested entry per caption cue.

    Every cue in the WebVTT body becomes a ``{"start", "line"}`` entry under
    ``body`` so that a match on the text can be traced back to its timestamp.
    """

    media_id = fields.KeywordField()
    lang = fields.KeywordField(index=False)
    # Nested so that each caption line stays paired with its own start time.
    body = fields.NestedField(
        properties={
            # Stored alongside the line but not indexed for searching.
            "start": fields.KeywordField(index=False),
            "line": fields.TextField(analyzer=settings.OPENSEARCH_TEXT_ANALYZER),
        },
    )
    suggest = fields.CompletionField(analyzer="keyword", search_analyzer="keyword")

    class Index:
        name = "content_subtitle"
        settings = settings.OPENSEARCH_DSL_SETTINGS

    class Django:
        model = Subtitle

    def prepare_body(self, instance: Subtitle):
        """Build the nested ``body`` value from the subtitle's WebVTT text.

        Returns a list of ``{"start": ..., "line": ...}`` dicts, one per
        parsed caption; the ``end`` time produced by the parser is dropped.
        """
        kept_keys = ("start", "line")
        return [
            {key: cue[key] for key in kept_keys}
            for cue in self.split_webvtt(instance.body)
        ]

    @staticmethod
    def split_webvtt(body: str):
        """Parse WebVTT text into a list of ``{"start", "end", "line"}`` dicts.

        Blank or whitespace-only input yields an empty list. Line breaks
        inside a caption are replaced with single spaces.
        """
        text = body.strip()
        if not text:
            return []

        cues = []
        for caption in webvtt.from_buffer(StringIO(text)):
            single_line = caption.text.replace("\n", " ")
            cues.append({"start": caption.start, "end": caption.end, "line": single_line})
        return cues
Key ideas:
- Parse subtitle file and index each line individually
- Store start time with line (index=False to exclude from search)
- Use NestedField to manage line and timestamp together
- Use WebVTT format
Step 3: Search Logic
Search both metadata and captions, then merge the two result sets by media.
def _total_hits(response) -> int:
    """Return the total hit count reported by a search response."""
    total = response.hits.total
    # The total may be an object carrying ``value`` or a bare integer,
    # depending on the engine/client version; accept both, and treat a
    # missing total as zero.
    return int(getattr(total, "value", total) or 0)


def document_search(*, q: str, page: int, size: int) -> SearchResultDict:
    """Search media metadata and caption lines, merged by media id.

    Args:
        q: Free-text query.
        page: 1-based page number.
        size: Page size; also the maximum number of media returned.

    Returns:
        A ``SearchResultDict`` with:
        - ``lines``: ordered mapping of media id to ``None`` (matched on
          title/description only) or a list of ``{"start", "line"}`` caption
          matches. Metadata matches come first; a caption match for the same
          media replaces the ``None`` but keeps its position.
        - ``count``: the larger of the two searches' reported totals. This is
          an approximation of the number of distinct media, because the two
          result sets may overlap.
        - ``pages``: ``ceil(count / size)``, i.e. the number of pages on which
          at least one of the two searches still returns hits.
    """
    offset = (page - 1) * size

    # Search media metadata (title weighted 2x over description).
    media_search = MediaDocument.search()
    media_search = media_search.query("multi_match", query=q, fields=["title^2", "description"])
    media_search = media_search[offset : offset + size]
    media_response = media_search.execute()

    # Search caption lines; inner_hits returns the matching lines themselves.
    subtitle_search = SubtitleDocument.search()
    subtitle_search = subtitle_search.query(
        "nested",
        path="body",
        query=Q("match", body__line=q),
        inner_hits={"sort": [{"body.start": {"order": "asc"}}], "size": 6},  # Up to 6 matched lines
    )
    subtitle_search = subtitle_search[offset : offset + size]
    subtitle_response = subtitle_search.execute()

    # Merge results, keyed by media id.
    lines: OrderedDict[str, list[MatchedLineDict] | None] = OrderedDict()
    for hit in media_response:
        if hit.media_id:
            lines[hit.media_id] = None  # Metadata match

    for hit in subtitle_response:
        if not hit.media_id:
            continue
        matched_lines: list[MatchedLineDict] = []
        if hasattr(hit.meta, "inner_hits") and "body" in hit.meta.inner_hits:
            matched_lines = [
                {"start": inner_hit.start, "line": inner_hit.line}
                for inner_hit in hit.meta.inner_hits.body
            ]
        lines[hit.media_id] = matched_lines  # Caption match

    # Both searches may each fill a page, so cap the merged page at ``size``.
    lines = OrderedDict(list(lines.items())[:size])

    # Derive totals from what the engines report rather than from the size of
    # this (already truncated) page; otherwise ``pages`` could never exceed 1.
    total_count = max(_total_hits(media_response), _total_hits(subtitle_response))
    return SearchResultDict(lines=lines, count=total_count, pages=math.ceil(total_count / size))
class Media(Model):
    # ...

    @classmethod
    async def search(cls, *, q: str, page: int, size: int, filter: Literal["public", "all"]):
        """Return one page of media, optionally narrowed by a text query.

        Args:
            q: Free-text query. When empty, the queryset is paginated as is.
            page: 1-based page number.
            size: Page size.
            filter: ``"public"`` keeps only media whose public-access window
                (``start`` .. ``archive``) contains the current time;
                ``"all"`` applies no such restriction.

        Returns:
            A dict with ``items``, ``count``, ``size``, ``page`` and ``pages``.
            Every item carries a ``matched_lines`` attribute: the matched
            caption lines, or ``None`` when there was no query or the media
            matched on metadata only.
        """
        from apps.content.documents import document_search

        qs = cls.annotate_accessible().select_related("owner")
        if filter == "public":
            # Take a single timestamp so both bounds are compared against the same instant.
            now = timezone.now()
            qs = qs.filter(publicaccessmedia__start__lte=now, publicaccessmedia__archive__gte=now)

        if not q:
            paginated = await offset_paginate(qs, page=page, size=size)
            for media in paginated["items"]:
                media.matched_lines = None
            return paginated

        # document search
        searched = await sync_to_async(document_search)(q=q, page=page, size=size)
        lines = searched["lines"]

        items = [m async for m in qs.filter(id__in=lines.keys())]
        # An ``id__in`` filter does not return rows in the order of the ids,
        # so restore the order produced by the document search.
        rank = {media_id: position for position, media_id in enumerate(lines)}
        items.sort(key=lambda m: rank[m.pk])

        for media in items:
            media.matched_lines = lines[media.pk]

        return {
            "items": items,
            "count": searched["count"],
            "size": size,
            "page": page,
            "pages": searched["pages"],
        }
Search strategy:
- Search title/description (2x weight on title)
- Search caption lines (nested query)
- Retrieve up to 6 matched lines using inner_hits
- Merge results by media_id
Step 4: API
class SearchedMediaSchema(LearningObjectMixinSchema):
    """Response schema for one media item in a search result page."""

    class MatchedLineSchema(Schema):
        """A single caption line that matched the query."""

        start: str  # start time of the caption
        line: str  # caption text

    id: str
    thumbnail: str
    duration_seconds: float
    owner: OwnerSchema
    format: Media.MediaFormatChoices
    uploaded: bool
    url: str
    accessible: bool
    # Matched caption lines, or None when no caption match is attached.
    matched_lines: list[MatchedLineSchema] | None
@router.get("/search", response=PaginatedResponse[SearchedMediaSchema])
async def search(
    request: HttpRequest,
    page: Annotated[int, functions.Query(1, ge=1)],
    # ``ge`` (not ``gte``) is the lower-bound keyword, matching ``page`` above;
    # an unrecognised keyword would leave ``size=0`` unvalidated.
    # NOTE(review): assumes ``functions.Query`` follows pydantic's ``Field``
    # constraint names (gt/ge/lt/le) - confirm against the imported module.
    size: Annotated[int, functions.Query(settings.DEFAULT_PAGINATION_SIZE, ge=1, le=100)],
    q: str = "",
    filter: Literal["public", "all"] = "public",
):
    """Search media by title, description, and captions.

    Query parameters:
        page: 1-based page number (minimum 1).
        size: Page size, between 1 and 100.
        q: Search text; an empty value lists media without searching.
        filter: ``"public"`` (default) or ``"all"``.

    Delegates to ``Media.search`` and returns its paginated result.
    """
    return await Media.search(q=q, page=page, size=size, filter=filter)

Top comments (0)