🧩 The problem
In this last post of the series we'll see how to automatically create the Jekyll pages for YouTube videos to mimic a mirror-like system. We already know how to download the elements using the script we finalized in the previous post.
These pages simply contain an HTML5 video embed, useful in case YouTube goes down or something happens to the channel.
⚠️ Warning
Just like in the previous posts: ⚠️⚠️ before continuing, please only mirror content you have permission to mirror ⚠️⚠️
✅ The solution
This time we are back in the Jekyll blog repository. We also have all the content we need served via Apache:
- videos
- thumbnails
- titles
🗂️ Apache HTTPD listings
As you can see, each directory name corresponds to a YouTube video ID. The standard Apache HTTPD directory listing makes it really simple to get the information we need. To keep things stateless and maintenance-free, I can scrape this page and assume that every element we need is already present in the directories:
def filter_youtube_video_ids(tag) -> bool:
    r"""A BeautifulSoup 4 compatible callback function."""
    ok: bool = False
    if tag and tag.string and re.match(r'[a-zA-Z0-9_-]{11}/', tag.string):
        ok = True
    return ok


def get_youtube_video_ids_from_apache_auto_indexing(url: str) -> list[str]:
    r"""Scrape YouTube video ids from an Apache HTTPD "auto-indexing" page."""
    with httpx.Client() as client:
        response = client.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    new_soup = list(set(soup.find_all('a', string=filter_youtube_video_ids)))
    return [tag.string[:-1] if tag.string[-1] == '/' else tag.string for tag in new_soup]
What does all this do? First of all we download the page content and load it into a BeautifulSoup instance. We then need to get all unique `<a>` HTML elements whose content is a YouTube video ID. We can use a callback function with a regex to filter the names for this purpose. As far as I know, YouTube video IDs are strings of length 11 containing a combination of these characters:

- letters (`a-zA-Z`)
- numbers (`0-9`)
- dash (`-`)
- underscore (`_`)
The second function returns all matching video IDs present on the Apache listings page.
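If you want to see the regex in isolation, here is a quick, self-contained check. The listing entries below are invented examples, not real directory names from my server:

import re

# Directory names on the listing page end with a trailing slash.
ID_PATTERN = re.compile(r'[a-zA-Z0-9_-]{11}/')

print(bool(ID_PATTERN.match('AbC_123-xyZ/')))      # True: 11 allowed characters plus the slash
print(bool(ID_PATTERN.match('Parent Directory')))  # False: spaces are not allowed
print(bool(ID_PATTERN.match('shortname/')))        # False: fewer than 11 characters before the slash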
🕵️‍♂️ Get local video IDs
The next step is to know which video pages are already present on the blog. Each markdown page file name corresponds to a video ID:
def get_youtube_video_ids_from_markdown_files(src_dir: str) -> list[str]:
    r"""Get all markdown files."""
    markdown_dir: pathlib.Path = pathlib.Path(src_dir)
    return [f.stem for f in markdown_dir.rglob('*') if f.is_file() and f.suffix == '.md']
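If you want to try it quickly, a throwaway directory with a couple of invented IDs shows what the function returns (this assumes the function above is already defined):

import pathlib
import tempfile

# Create a temporary directory that mimics the pages layout (file names are invented).
with tempfile.TemporaryDirectory() as tmp:
    for video_id in ('AbC_123-xyZ', 'zyx-321_CbA'):
        pathlib.Path(tmp, f'{video_id}.md').write_text('---\n---\n')
    print(get_youtube_video_ids_from_markdown_files(tmp))
    # ['AbC_123-xyZ', 'zyx-321_CbA'] (order depends on the filesystem)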
🔍 Filter video IDs
We now need to filter out the video IDs already present locally. The set difference operator is really useful for this.
def filter_youtube_video_ids_markdown_files(videos_baseurl: str, pages_base_directory: str) -> list[str]:
    r"""Get all remote videos not available locally."""
    remote_videos: list[str] = get_youtube_video_ids_from_apache_auto_indexing(videos_baseurl)
    local_videos: list[str] = get_youtube_video_ids_from_markdown_files(pages_base_directory)
    return list(set(remote_videos) - set(local_videos))
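The set difference itself is trivial to verify on its own (again, the IDs are invented):

remote = {'AbC_123-xyZ', 'zyx-321_CbA'}
local = {'zyx-321_CbA'}

# Only videos that do not have a local page yet survive the difference.
print(remote - local)  # {'AbC_123-xyZ'}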
🥣 Prepare the content
Our pages need a title attribute. We can simply prepare all the URLs in a data structure and scrape the title files, one by one. Remember to use an HTTP session (`Client`) to improve performance.
def get_remote_data(video_baseurl: str, video_ids: list[str]) -> list[dict]:
    r"""Get the data structure."""
    title_files: list[dict] = [
        {
            'id': v,
            'title_url': urljoin(urljoin(video_baseurl, v) + '/', 'title.txt'),
            'video_url': urljoin(urljoin(video_baseurl, v) + '/', v + '.webm'),
            'thumbnail_url': urljoin(urljoin(video_baseurl, v) + '/', v + '.png'),
        } for v in video_ids
    ]
    with httpx.Client() as client:
        for t in title_files:
            t['title'] = client.get(t['title_url']).text.strip()
    return title_files
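To make the resulting data structure concrete, here is roughly what one entry would look like for an invented base URL and video ID. The title comes from the scraped title.txt file, and note that the base URL must end with a slash so that urljoin appends the ID instead of replacing the last path segment:

# get_remote_data('https://example.org/mirror/', ['AbC_123-xyZ']) would return
# something along these lines:
[
    {
        'id': 'AbC_123-xyZ',
        'title_url': 'https://example.org/mirror/AbC_123-xyZ/title.txt',
        'video_url': 'https://example.org/mirror/AbC_123-xyZ/AbC_123-xyZ.webm',
        'thumbnail_url': 'https://example.org/mirror/AbC_123-xyZ/AbC_123-xyZ.png',
        'title': 'An example video title',
    }
]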
📽️ Video pages
Creating a page is easy once you have the data. f-strings come in handy:
def create_markdown_page(data: dict):
    return f"""---
layout: youtube_mirror
title: |-
  {data["title"]}
permalink: /courses/solvecomputerscience-youtube-channel/{data["id"]}/
description: |-
  {data["title"]}
enable_markdown: true
lang: 'en'
backup_url: ! '{data["video_url"]}'
poster_image: ! '{data["thumbnail_url"]}'
is_youtube_mirror: true
canonical_url: ! 'https://www.youtube.com/watch?v={data["id"]}'
---
"""
def create_markdown_pages(pages_base_directory: str, data: list[dict]):
    for d in data:
        page: str = create_markdown_page(d)
        filename: str = ''.join([d['id'], '.md'])
        pathlib.Path(pages_base_directory, filename).write_text(page)
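As a quick sanity check, feeding the page builder an invented data dictionary produces front matter along these lines:

example = {
    'id': 'AbC_123-xyZ',
    'title': 'An example video title',
    'video_url': 'https://example.org/mirror/AbC_123-xyZ/AbC_123-xyZ.webm',
    'thumbnail_url': 'https://example.org/mirror/AbC_123-xyZ/AbC_123-xyZ.png',
}
print(create_markdown_page(example))
# ---
# layout: youtube_mirror
# title: |-
#   An example video title
# permalink: /courses/solvecomputerscience-youtube-channel/AbC_123-xyZ/
# ...
# canonical_url: ! 'https://www.youtube.com/watch?v=AbC_123-xyZ'
# ---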
Remember from an earlier post in this series that these pages only need some YAML front matter.
👨‍💻 pre-commit hook
Finally, just like for the other script, we need to set up a local pre-commit hook and declare the Python dependencies it needs:
- repo: local
  hooks:
    - id: generate_youtube_mirror_pages
      name: generate_youtube_mirror_pages
      entry: python ./.scripts/youtube/generate_youtube_video_pages.py
      verbose: true
      always_run: true
      pass_filenames: false
      language: python
      types: [python]
      additional_dependencies: ['httpx>=0,<1', 'beautifulsoup4>=4.13,<4.14']
🎉 Conclusion
Click to open the full script
import re
import pathlib
import itertools
import time
import datetime
from urllib.parse import urlparse, parse_qs, urljoin

import httpx
from bs4 import BeautifulSoup

VIDEOS_BASEURL: str = 'https://your.apache.directory/listing/page'
PAGES_BASE_DIRECTORY: str = '_courses/solve-computer-science-youtube-channel'


def filter_youtube_video_ids(tag) -> bool:
    r"""A BeautifulSoup 4 compatible callback function."""
    ok: bool = False
    if tag and tag.string and re.match(r'[a-zA-Z0-9_-]{11}/', tag.string):
        ok = True
    return ok


def get_youtube_video_ids_from_apache_auto_indexing(url: str) -> list[str]:
    r"""Scrape YouTube video ids from an Apache HTTPD "auto-indexing" page."""
    with httpx.Client() as client:
        response = client.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    new_soup = list(set(soup.find_all('a', string=filter_youtube_video_ids)))
    return [tag.string[:-1] if tag.string[-1] == '/' else tag.string for tag in new_soup]


def get_youtube_video_ids_from_markdown_files(src_dir: str) -> list[str]:
    r"""Get all markdown files."""
    markdown_dir: pathlib.Path = pathlib.Path(src_dir)
    return [f.stem for f in markdown_dir.rglob('*') if f.is_file() and f.suffix == '.md']


def filter_youtube_video_ids_markdown_files(videos_baseurl: str, pages_base_directory: str) -> list[str]:
    r"""Get all remote videos not available locally."""
    remote_videos: list[str] = get_youtube_video_ids_from_apache_auto_indexing(videos_baseurl)
    local_videos: list[str] = get_youtube_video_ids_from_markdown_files(pages_base_directory)
    return list(set(remote_videos) - set(local_videos))


def get_remote_data(video_baseurl: str, video_ids: list[str]) -> list[dict]:
    r"""Get the data structure."""
    title_files: list[dict] = [
        {
            'id': v,
            'title_url': urljoin(urljoin(video_baseurl, v) + '/', 'title.txt'),
            'video_url': urljoin(urljoin(video_baseurl, v) + '/', v + '.webm'),
            'thumbnail_url': urljoin(urljoin(video_baseurl, v) + '/', v + '.png'),
        } for v in video_ids
    ]
    with httpx.Client() as client:
        for t in title_files:
            t['title'] = client.get(t['title_url']).text.strip()
    return title_files


def create_markdown_page(data: dict):
    return f"""---
layout: youtube_mirror
title: |-
  {data["title"]}
permalink: /courses/solvecomputerscience-youtube-channel/{data["id"]}/
description: |-
  {data["title"]}
enable_markdown: true
lang: 'en'
backup_url: ! '{data["video_url"]}'
poster_image: ! '{data["thumbnail_url"]}'
is_youtube_mirror: true
canonical_url: ! 'https://www.youtube.com/watch?v={data["id"]}'
---
"""


def create_markdown_pages(pages_base_directory: str, data: list[dict]):
    for d in data:
        page: str = create_markdown_page(d)
        filename: str = ''.join([d['id'], '.md'])
        pathlib.Path(pages_base_directory, filename).write_text(page)


if __name__ == '__main__':
    video_ids: list[str] = filter_youtube_video_ids_markdown_files(VIDEOS_BASEURL, PAGES_BASE_DIRECTORY)
    data: list[dict] = get_remote_data(VIDEOS_BASEURL, video_ids)
    create_markdown_pages(PAGES_BASE_DIRECTORY, data)
Let me know if you found this series of posts useful! You can comment here and check my YouTube channel.