Video Content Search with Captions

#django #opensearch #python #webdev

Video Content Search with Captions

Problem

Let users search the content of videos and jump directly to the moment where the search term is mentioned.

Solution

Search query matches against video title, description, and captions.
When a caption matches, playback jumps to that caption's timestamp. Otherwise (only the title or description matched), it starts from the beginning of the video.

Built with Django and OpenSearch.

Implementation

Step 1: Define Models

Models for storing media and subtitles.

class Media(Model):
    """A media item described by a title and an optional description."""

    # Short display name, limited to 255 characters.
    title = CharField(verbose_name=_("Title"), max_length=255)
    # Free-form text; may be left blank, in which case an empty string is stored.
    description = TextField(verbose_name=_("Description"), default="", blank=True)

class Subtitle(Model):
    """Subtitle text belonging to a media item, tagged with a language code."""

    # Owning media item; subtitles are deleted together with it (CASCADE).
    media = ForeignKey(to=Media, on_delete=CASCADE, verbose_name=_("Media"))
    # Language code of this subtitle, at most 10 characters.
    lang = CharField(verbose_name=_("Language"), max_length=10)
    body = TextField(verbose_name=_("Body"))  # WebVTT format

Step 2: OpenSearch Document Indexing

Whenever a subtitle is saved, each of its caption lines is indexed together with its start timestamp.
A NestedField keeps every line paired with its own timestamp.

@registry.register_document
class SubtitleDocument(Document):
    """Search document built from a ``Subtitle``.

    The WebVTT body is split into captions, and each caption is stored as a
    nested ``{"start", "line"}`` entry so that a matching line can be returned
    together with its start time.
    """

    media_id = fields.KeywordField()
    lang = fields.KeywordField(index=False)
    # One nested entry per caption: only ``line`` is analyzed for full-text search.
    body = fields.NestedField(
        properties={
            "start": fields.KeywordField(index=False),
            "line": fields.TextField(analyzer=settings.OPENSEARCH_TEXT_ANALYZER),
        }
    )
    suggest = fields.CompletionField(analyzer="keyword", search_analyzer="keyword")

    class Index:
        name = "content_subtitle"
        settings = settings.OPENSEARCH_DSL_SETTINGS

    class Django:
        model = Subtitle

    def prepare_body(self, instance: Subtitle):
        """Return the nested ``body`` entries for ``instance``.

        Each parsed caption contributes its ``start`` and ``line``; the
        ``end`` value produced by ``split_webvtt`` is not indexed.
        """
        entries = []
        for caption in self.split_webvtt(instance.body):
            entries.append({"start": caption["start"], "line": caption["line"]})
        return entries

    @staticmethod
    def split_webvtt(body: str):
        """Parse WebVTT text into a list of ``{"start", "end", "line"}`` dicts.

        Blank (or whitespace-only) input yields an empty list. Line breaks
        inside a caption are replaced by single spaces.
        """
        text = body.strip()
        if not text:
            return []

        parsed = []
        for caption in webvtt.from_buffer(StringIO(text)):
            single_line = caption.text.replace("\n", " ")
            parsed.append({"start": caption.start, "end": caption.end, "line": single_line})
        return parsed

Key ideas:

Parse subtitle file and index each line individually
Store the start time alongside each line (index=False means the timestamp itself is not searchable; only the caption text is)
Use NestedField to manage line and timestamp together
Use WebVTT format

Step 3: Search Logic

Search both metadata and captions simultaneously.

def _total_hits(response) -> int:
    """Return the total number of matches reported by a search response.

    ``hits.total`` is either a plain integer or an object carrying the number
    in its ``value`` attribute, depending on the server/client version.
    """
    total = response.hits.total
    return int(getattr(total, "value", total))


def document_search(*, q: str, page: int, size: int) -> SearchResultDict:
    """Search media metadata and caption lines and merge the hits per media id.

    Args:
        q: Free-text query.
        page: 1-based page number.
        size: Maximum number of media items per page.

    Returns:
        A ``SearchResultDict`` with

        * ``lines``: ordered mapping ``media_id -> matched caption lines``.
          The value is ``None`` when only the title/description matched, and
          a list of ``{"start", "line"}`` dicts (at most 6) when captions matched.
        * ``count``: estimated number of matching media items.
        * ``pages``: number of pages that can contain results.

    Raises:
        ValueError: If ``page`` or ``size`` is smaller than 1.
    """
    # Fail early with a clear message instead of a negative slice (page < 1)
    # or a ZeroDivisionError when computing the page count (size == 0).
    if page < 1 or size < 1:
        raise ValueError("page and size must both be >= 1")

    offset = (page - 1) * size

    # Search media metadata (title, description); a title match weighs double.
    media_search = MediaDocument.search()
    media_search = media_search.query("multi_match", query=q, fields=["title^2", "description"])
    media_search = media_search[offset : offset + size]
    media_response = media_search.execute()

    # Search captions; inner_hits returns the matching lines themselves.
    subtitle_search = SubtitleDocument.search()
    subtitle_search = subtitle_search.query(
        "nested",
        path="body",
        query=Q("match", body__line=q),
        inner_hits={"sort": [{"body.start": {"order": "asc"}}], "size": 6},  # Up to 6 matched lines
    )
    subtitle_search = subtitle_search[offset : offset + size]
    subtitle_response = subtitle_search.execute()

    # Merge results: metadata hits first, then caption hits. A caption hit for
    # an already-listed media keeps its position but replaces the ``None``.
    lines: OrderedDict[str, list[MatchedLineDict] | None] = OrderedDict()

    for hit in media_response:
        if hit.media_id:
            lines[hit.media_id] = None  # Metadata match

    for hit in subtitle_response:
        if not hit.media_id:
            continue
        matched_lines: list[MatchedLineDict] = []
        if hasattr(hit.meta, "inner_hits") and "body" in hit.meta.inner_hits:
            matched_lines = [
                {"start": inner_hit.start, "line": inner_hit.line}
                for inner_hit in hit.meta.inner_hits.body
            ]
        lines[hit.media_id] = matched_lines  # Caption match

    # NOTE(review): both indices are paged with the same offset, so a merged
    # page can hold up to 2 * size entries; the surplus is dropped here.
    lines = OrderedDict(list(lines.items())[:size])

    # The totals must come from the search responses. Counting the entries of
    # the current (already truncated) page would cap ``count`` at ``size`` and
    # ``pages`` at 1, so clients could never navigate past the first page.
    # Because both indices share the offset, a page is non-empty exactly while
    # the offset is below the larger of the two totals. ``count`` is an
    # estimate: the same media may be matched by both indices.
    total_count = max(_total_hits(media_response), _total_hits(subtitle_response), len(lines))

    return SearchResultDict(lines=lines, count=total_count, pages=math.ceil(total_count / size))


class Media(Model):
    # ...

    @classmethod
    async def search(cls, *, q: str, page: int, size: int, filter: Literal["public", "all"]):
        """Return one page of media, optionally restricted by a search query.

        Args:
            q: Free-text query. When empty, the queryset is paginated directly
                and no document search is performed.
            page: 1-based page number.
            size: Page size.
            filter: ``"public"`` keeps only media whose public-access window
                contains the current time; ``"all"`` applies no such filter.

        Returns:
            A dict with ``items``, ``count``, ``size``, ``page`` and ``pages``.
            Every item gets a ``matched_lines`` attribute: ``None`` without a
            query or for a metadata-only match, otherwise the matched caption
            lines.
        """
        from apps.content.documents import document_search

        qs = cls.annotate_accessible().select_related("owner")
        if filter == "public":
            # Evaluate "now" once so both bounds compare against the same instant.
            now = timezone.now()
            qs = qs.filter(publicaccessmedia__start__lte=now, publicaccessmedia__archive__gte=now)

        if not q:
            paginated = await offset_paginate(qs, page=page, size=size)
            for media in paginated["items"]:
                media.matched_lines = None
            return paginated

        # document search
        searched = await sync_to_async(document_search)(q=q, page=page, size=size)

        # The search index hands ids back as strings; normalise both sides with
        # str() so the lookup also works when the primary key is not a str.
        lines_by_id = {str(media_id): matched for media_id, matched in searched["lines"].items()}
        rank = {media_id: position for position, media_id in enumerate(lines_by_id)}

        items = [m async for m in qs.filter(id__in=searched["lines"].keys())]
        # An ``id__in`` filter does not keep the order of the given ids, so
        # restore the order produced by the document search.
        items.sort(key=lambda m: rank[str(m.pk)])

        for media in items:
            media.matched_lines = lines_by_id[str(media.pk)]

        paginated: dict = {
            "items": items,
            "count": searched["count"],
            "size": size,
            "page": page,
            "pages": searched["pages"],
        }
        return paginated

Search strategy:

Search title/description (2x weight on title)
Search caption lines (nested query)
Retrieve up to 6 matched lines using inner_hits
Merge results by media_id

Step 4: API

class SearchedMediaSchema(LearningObjectMixinSchema):
    """Response schema for one media item returned by the search endpoint."""

    class MatchedLineSchema(Schema):
        """A single caption line that matched the search query."""

        # Start time of the caption, as stored in the search index.
        start: str
        # Text of the matched caption line.
        line: str

    id: str
    thumbnail: str
    duration_seconds: float
    owner: OwnerSchema
    format: Media.MediaFormatChoices
    uploaded: bool
    url: str

    accessible: bool
    # None when no caption lines are attached to the item (no query was given,
    # or only the title/description matched); otherwise the matched lines.
    matched_lines: list[MatchedLineSchema] | None


@router.get("/search", response=PaginatedResponse[SearchedMediaSchema])
async def search(
    request: HttpRequest,
    page: Annotated[int, functions.Query(1, ge=1)],
    # ``ge`` (not ``gte``) is the lower-bound constraint; with the misspelled
    # keyword no minimum was enforced and ``size=0`` was accepted.
    size: Annotated[int, functions.Query(settings.DEFAULT_PAGINATION_SIZE, ge=1, le=100)],
    q: str = "",
    filter: Literal["public", "all"] = "public",
):
    """Search media by title, description and captions.

    ``page`` starts at 1 and ``size`` must be between 1 and 100. An empty
    ``q`` lists media without searching. ``filter`` selects between publicly
    accessible media (``"public"``) and all media (``"all"``).
    """
    return await Media.search(q=q, page=page, size=size, filter=filter)