From 2894cdd8c8305b05e1d5ea37dd48d1780629d385 Mon Sep 17 00:00:00 2001 From: Barbagus Date: Tue, 13 Dec 2022 13:28:28 +0100 Subject: [PATCH 1/2] Fix HLS protocol terminology in the code #12 - versions => renditions - resolutions => variants - ranges and/or chunks => segments - version index => master playlist - other index => media playlist url For now, the CLI has not been updated with this terminology, only the code. --- README.md | 86 +++++++---------- src/delarte/__main__.py | 37 ++++---- src/delarte/api.py | 14 +-- src/delarte/hls.py | 198 +++++++++++++++++++++++++++++----------- 4 files changed, 200 insertions(+), 135 deletions(-) diff --git a/README.md b/README.md index f46e239..54c8d60 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ That _program identifier_ enables us to query an API for the program's informati ##### The _config_ API -For the last exemple the API call is as such: +For the last example the API call is as such: ``` https://api.arte.tv/api/player/v2/config/en/104001-000-A @@ -168,11 +168,11 @@ The response is a JSON object: } } ``` -Information about the program is detailed in `data.attributes.metadata` and a list of available audio/subtitles combinations in `data.attributes.streams`. In our code such a combination is refered to as a _version_. +Information about the program is detailed in `data.attributes.metadata` and a list of available audio/subtitles combinations in `data.attributes.streams`. In our code such a combination is referred to as a _rendition_ (or _version_ in the CLI). -Every such _version_ has a reference to a _version index_ file in `.streams[i].url` and description of the audio/subtitle combination in `.streams[i].versions[0]`. +Every such _rendition_ has a reference to a _master playlist_ file in `.streams[i].url` and a description of the audio/subtitle combination in `.streams[i].versions[0]`. 
-We are using `.streams[i].versions[0].eStat.ml5` as our _version codes_: +We are using `.streams[i].versions[0].eStat.ml5` as our _rendition_ key: - `VOF-STE[ANG]` English (Subtitles) - `VOF-STF` French (Original) @@ -181,9 +181,9 @@ We are using `.streams[i].versions[0].eStat.ml5` as our _version codes_: - `VA-STMA` German closed captioning - ... -##### The _version index_ file +##### The _master playlist_ -The file is in [HTTP Livestreaming](https://www.rfc-editor.org/rfc/rfc8216) `.m3u8` format: +As defined in [HTTP Live Streaming](https://www.rfc-editor.org/rfc/rfc8216), for example: ``` #EXTM3U @@ -204,13 +204,14 @@ medias/104001-000-A_v216.m3u8 ... ``` -This can be parsed with the [m3u8](https://pypi.org/project/m3u8/) library. +This file shows a list of video _variant_ URIs (one per video resolution). Each of them has: +- exactly one video _media playlist_ reference +- exactly one audio _media playlist_ reference +- at most one subtitles _media playlist_ reference -This file show the a list of _video index_ URIs (one per video resolution). Each of them is linked to exactly one _audio index_ file and at most one _subtitiles index_ file. +##### The video and audio _media playlist_ -##### The _video index_ files - -The file is also in [HTTP Livestreaming](https://www.rfc-editor.org/rfc/rfc8216) `.m3u8` format: +As defined in [HTTP Live Streaming](https://www.rfc-editor.org/rfc/rfc8216), for example: ``` #EXTM3U @@ -235,38 +236,12 @@ The file is also in [HTTP Livestreaming](https://www.rfc-editor.org/rfc/rfc8216) ... ``` -This file shows the list of _video chuncks_ the server expect to serve. +This file shows the list of _segments_ the server expects to serve. 
-##### The _audio index_ file -Similarly to the _video index_ file it shows the list of _audio chuncks_ the server expect to serve: +##### The subtitles _media playlist_ -``` -#EXTM3U -#EXT-X-TARGETDURATION:6 -#EXT-X-VERSION:7 -#EXT-X-MEDIA-SEQUENCE:1 -#EXT-X-INDEPENDENT-SEGMENTS -#EXT-X-PLAYLIST-TYPE:VOD -#EXT-X-MAP:URI="104001-000-A_aud_VOF.mp4",BYTERANGE="28752@0" -#EXTINF:5.991, -#EXT-X-BYTERANGE:82445@28752 -104001-000-A_aud_VOF.mp4 -#EXTINF:5.991, -#EXT-X-BYTERANGE:99299@111197 -104001-000-A_aud_VOF.mp4 -#EXTINF:5.991, -#EXT-X-BYTERANGE:101640@210496 -104001-000-A_aud_VOF.mp4 -#EXTINF:5.991, -#EXT-X-BYTERANGE:102047@312136 -104001-000-A_aud_VOF.mp4 -... -``` - -##### The _subtitles index_ file - -The file is also in [HTTP Livestreaming](https://www.rfc-editor.org/rfc/rfc8216) `.m3u8` format: +As defined in [HTTP Live Streaming](https://www.rfc-editor.org/rfc/rfc8216), for example: ``` #EXTM3U @@ -279,38 +254,39 @@ The file is also in [HTTP Livestreaming](https://www.rfc-editor.org/rfc/rfc8216) #EXT-X-ENDLIST ``` -This file shows the file(s) containing the subtitles data. +This file shows the file containing the subtitles data. ### ⚙️The process -1. Get the _config_ API object for the _program identifier_ - - Figure out the _output filename_ from _metadata_. - - Select a _version_. -2. Get the _version index_ file - - Select a resolution _video index_ along with its _audio index_ and _subtitle index_ -3. Get the subtitles in `vtt` format and convert them to `srt` -4. Feed the _video index_, _audio index_ and `srt` file to `ffmpeg` +1. Get the _config_ API object for the _program identifier_. + - Select a _rendition_. +2. Get the _master playlist_. + - Select a _variant_. +3. Download audio, video and subtitles media content. + - Convert `VTT` subtitles to `SRT`. +4. Figure out the _output filename_ from _metadata_. +5. 
Feed all the media to `ffmpeg` for _muxing_. ### 📽️ FFMPEG -The actual build of the video file is handled by [ffmpeg](https://ffmpeg.org/). The script expects [ffmpeg](https://ffmpeg.org/) to be installed in the environement and will call it as a subprocess. +The multiplexing (_muxing_) of the video file is handled by [ffmpeg](https://ffmpeg.org/). The script expects [ffmpeg](https://ffmpeg.org/) to be installed in the environment and will call it as a subprocess. -#### Why not use FFMPEG direcly with the _version index_ URL ? +#### Why not use FFMPEG directly with the HLS _master playlist_ URL ? -So we can select the video resolution and not rely on stream mapping arguments in `ffmpeg`. +So we can be more granular about the _renditions_ and _variants_ that we want. -#### Why not use VTT subtitles direcly ? +#### Why not use `VTT` subtitles directly ? Because it fails 😒. -#### Why not use FFMPEG direcly with the _video_ and _audio_ _index_ URL ? +#### Why not use FFMPEG directly with the _media playlist_ URLs and let it do the download ? -Because some programs would randomly fail 😒. Probably due to invalid _chunking_ on the server. +Because some programs would randomly fail 😒. Probably due to invalid _segmentation_ on the server. ### 📌 Dependences -- [m3u8](https://pypi.org/project/m3u8/) to parse index files. +- [m3u8](https://pypi.org/project/m3u8/) to parse playlists. - [webvtt-py](https://pypi.org/project/webvtt-py/) to load `vtt` subtitles files. ### 🤝 Help diff --git a/src/delarte/__main__.py b/src/delarte/__main__.py index 66005c5..8469a04 100644 --- a/src/delarte/__main__.py +++ b/src/delarte/__main__.py @@ -19,23 +19,20 @@ from . import naming from . 
import www -def fail(message, code=1): - """Print a message to STDERR and return a given exit code.""" +def _fail(message, code=1): print(message, file=sys.stderr) return code -def print_available_versions(config, f): - """Print available program versions.""" +def _print_available_renditions(config, f): print(f"Available versions:", file=f) - for code, label in api.iter_versions(config): + for code, label in api.iter_renditions(config): print(f"\t{code} - {label}", file=f) -def print_available_resolutions(version_index, f): - """Print available version resolutions.""" +def _print_available_variants(version_index, f): print(f"Available resolutions:", file=f) - for code, label in hls.iter_resolutions(version_index): + for code, label in hls.iter_variants(version_index): print(f"\t{code} - {label}", file=f) @@ -76,33 +73,33 @@ def main(): try: www_lang, program_id = www.parse_url(args.pop(0)) except ValueError as e: - return fail(f"Invalid url: {e}") + return _fail(f"Invalid url: {e}") try: config = api.load_config(www_lang, program_id) except ValueError: - return fail("Invalid program") + return _fail("Invalid program") if not args: - print_available_versions(config, sys.stdout) + _print_available_renditions(config, sys.stdout) return 0 - version_index_url = api.select_version(config, args.pop(0)) - if version_index_url is None: - fail("Invalid version") - print_available_versions(config, sys.stderr) + master_playlist_url = api.select_rendition(config, args.pop(0)) + if master_playlist_url is None: + _fail("Invalid version") + _print_available_renditions(config, sys.stderr) return 1 - version_index = hls.load_version_index(version_index_url) + master_playlist = hls.load_master_playlist(master_playlist_url) if not args: - print_available_resolutions(version_index, sys.stdout) + _print_available_variants(master_playlist, sys.stdout) return 0 - remote_inputs = hls.select_resolution(version_index, args.pop(0)) + remote_inputs = hls.select_variant(master_playlist, 
args.pop(0)) if remote_inputs is None: - fail("Invalid resolution") - print_available_resolutions(version_index, sys.stderr) + _fail("Invalid resolution") + _print_available_variants(master_playlist, sys.stderr) return 0 file_base_name = naming.build_file_base_name(config) diff --git a/src/delarte/api.py b/src/delarte/api.py index c492577..d58f049 100644 --- a/src/delarte/api.py +++ b/src/delarte/api.py @@ -38,21 +38,21 @@ def load_config(lang, program_id): return config -def iter_versions(config): - """Return a (code, label, index_url) iterator.""" +def iter_renditions(config): + """Return a rendition (code, label) iterator.""" for stream in config["attributes"]["streams"]: yield ( - # version code + # rendition code stream["versions"][0]["eStat"]["ml5"], - # version full name + # rendition full name stream["versions"][0]["label"], ) -def select_version(config, version_code): - """Return the version index url for the given version code.""" +def select_rendition(config, rendition_code): + """Return the master playlist index url for the given rendition code.""" for stream in config["attributes"]["streams"]: - if stream["versions"][0]["eStat"]["ml5"] == version_code: + if stream["versions"][0]["eStat"]["ml5"] == rendition_code: return stream["url"] return None diff --git a/src/delarte/hls.py b/src/delarte/hls.py index c356478..c0cfb1f 100644 --- a/src/delarte/hls.py +++ b/src/delarte/hls.py @@ -3,6 +3,60 @@ """Provide HLS protocol utilities.""" +# For terminology, from HLS protocol RFC8216 + +# 2. Overview +# +# A multimedia presentation is specified by a Uniform Resource +# Identifier (URI) [RFC3986] to a Playlist. +# +# A Playlist is either a Media Playlist or a Master Playlist. Both are +# UTF-8 text files containing URIs and descriptive tags. +# +# A Media Playlist contains a list of Media Segments, which, when +# played sequentially, will play the multimedia presentation. 
+# +# Here is an example of a Media Playlist: +# +# #EXTM3U +# #EXT-X-TARGETDURATION:10 +# +# #EXTINF:9.009, +# http://media.example.com/first.ts +# #EXTINF:9.009, +# http://media.example.com/second.ts +# #EXTINF:3.003, +# http://media.example.com/third.ts +# +# The first line is the format identifier tag #EXTM3U. The line +# containing #EXT-X-TARGETDURATION says that all Media Segments will be +# 10 seconds long or less. Then, three Media Segments are declared. +# The first and second are 9.009 seconds long; the third is 3.003 +# seconds. +# +# To play this Playlist, the client first downloads it and then +# downloads and plays each Media Segment declared within it. The +# client reloads the Playlist as described in this document to discover +# any added segments. Data SHOULD be carried over HTTP [RFC7230], but, +# in general, a URI can specify any protocol that can reliably transfer +# the specified resource on demand. +# +# A more complex presentation can be described by a Master Playlist. A +# Master Playlist provides a set of Variant Streams, each of which +# describes a different version of the same content. +# +# A Variant Stream includes a Media Playlist that specifies media +# encoded at a particular bit rate, in a particular format, and at a +# particular resolution for media containing video. +# +# A Variant Stream can also specify a set of Renditions. Renditions +# are alternate versions of the content, such as audio produced in +# different languages or video recorded from different camera angles. +# +# Clients should switch between different Variant Streams to adapt to +# network conditions. Clients should choose Renditions based on user +# preferences. + import contextlib import io import os @@ -16,68 +70,107 @@ from urllib.request import urlopen import m3u8 import webvtt +# +# WARNING ! +# +# This module does not aim for a full implementation of HLS, only the +# subset useful for the actual observed usage of ArteTV. 
+# +# - URIs are relative file paths +# - Master playlists have at least one variant +# - Every variant is of different resolution +# - Every variant has exactly one audio medium +# - Every variant has at most one subtitles medium +# - Audio and video media playlists segments are incrmental ranges of the same file +# - Subtitles media playlists have only one segment -def load_version_index(url): - """Retrieve a version from m3u8 file.""" - version_index = m3u8.load(url) - if not version_index.playlists: +def _make_resolution_code(variant): + # resolution code (1080p, 720p, ...) + return f"{variant.stream_info.resolution[1]}p" + + +def _is_relative_file_path(uri): + try: + url = urlparse(uri) + return url.path == uri and not uri.startswith("/") + except ValueError: + return False + + +def load_master_playlist(url): + """Download and return a master playlist.""" + master_playlist = m3u8.load(url) + + if not master_playlist.playlists: raise ValueError("Unexpected missing playlists") - for pl in version_index.playlists: - count = 0 - for m in pl.media: + resolution_codes = set() + + for variant in master_playlist.playlists: + resolution_code = _make_resolution_code(variant) + + if resolution_code in resolution_codes: + raise ValueError("Unexpected duplicate resolution") + resolution_codes.add(resolution_code) + + audio_media = False + subtitles_media = False + + for m in variant.media: + if not _is_relative_file_path(m.uri): + raise ValueError("Invalid relative file name") + if m.type == "AUDIO": - count += 1 - if count != 1: - raise ValueError("Unexpected missing or multiple audio tracks") + if audio_media: + raise ValueError("Unexpected multiple audio tracks") + audio_media = True - count = 0 - for m in pl.media: - if m.type == "SUBTITLES": - count += 1 - if count > 1: - raise ValueError("Unexpected multiple subtitle tracks") + elif m.type == "SUBTITLES": + if subtitles_media: + raise ValueError("Unexpected multiple subtitles tracks") + subtitles_media = True - 
return version_index + if not audio_media: + raise ValueError("Unexpected missing audio track") + + return master_playlist -def iter_resolutions(version_index): - """Iterate over resolution options.""" - for pl in sorted( - version_index.playlists, - key=lambda pl: pl.stream_info.resolution[1], +def iter_variants(master_playlist): + """Iterate over variants.""" + for variant in sorted( + master_playlist.playlists, + key=lambda v: v.stream_info.resolution[1], reverse=True, ): yield ( - # resolution code (1080p, 720p, ...) - f"{pl.stream_info.resolution[1]}p", - # resolution label - f"{pl.stream_info.resolution[0]} x {pl.stream_info.resolution[1]}", + _make_resolution_code(variant), + f"{variant.stream_info.resolution[0]} x {variant.stream_info.resolution[1]}", ) -def select_resolution(version_index, resolution_code): - """Return the stream information for a given resolution_code.""" - for pl in version_index.playlists: - code = f"{pl.stream_info.resolution[1]}p" +def select_variant(master_playlist, resolution_code): + """Return the stream information for a given resolution code.""" + for variant in master_playlist.playlists: + code = _make_resolution_code(variant) if code != resolution_code: continue audio_track = None - for m in pl.media: + for m in variant.media: if m.type == "AUDIO": - audio_track = (m.language, pl.base_uri + m.uri) + audio_track = (m.language, variant.base_uri + m.uri) break subtitles_track = None - for m in pl.media: + for m in variant.media: if m.type == "SUBTITLES": - subtitles_track = (m.language, pl.base_uri + m.uri) + subtitles_track = (m.language, variant.base_uri + m.uri) break return ( - pl.base_uri + pl.uri, + variant.base_uri + variant.uri, audio_track, subtitles_track, ) @@ -85,28 +178,27 @@ def select_resolution(version_index, resolution_code): return None -def parse_byterange(obj): - """Parse a M3U8 `byterange` (count@offset) into http range (range_start, rang_end).""" +def _parse_byterange(obj): + # Parse a M3U8 `byterange` 
(count@offset) into http range (range_start, rang_end) count, offset = [int(v) for v in obj.byterange.split("@")] return offset, offset + count - 1 -def load_av_index(index_url): - """Load a M3U8 audio or video index.""" - index = m3u8.load(index_url) +def _load_av_segments(media_playlist_url): + media_playlist = m3u8.load(media_playlist_url) - file_name = index.segment_map[0].uri - range_start, range_end = parse_byterange(index.segment_map[0]) + file_name = media_playlist.segment_map[0].uri + range_start, range_end = _parse_byterange(media_playlist.segment_map[0]) if range_start != 0: raise ValueError("Invalid a/v index: does not start at 0") chunks = [(range_start, range_end)] total = range_end + 1 - for segment in index.segments: + for segment in media_playlist.segments: if segment.uri != file_name: raise ValueError("Invalid a/v index: multiple file names") - range_start, range_end = parse_byterange(segment) + range_start, range_end = _parse_byterange(segment) if range_start != total: raise ValueError( f"Invalid a/v index: discontious ranges ({range_start} != {total})" @@ -115,12 +207,12 @@ def load_av_index(index_url): chunks.append((range_start, range_end)) total = range_end + 1 - return urlparse(index.segment_map[0].absolute_uri), chunks + return urlparse(media_playlist.segment_map[0].absolute_uri), chunks -def download_av_input(index_url, progress): - """Download an audio or video stream to temporary directory.""" - url, ranges = load_av_index(index_url) +def _download_av_stream(media_playlist_url, progress): + # Download an audio or video stream to temporary directory + url, ranges = _load_av_segments(media_playlist_url) total = ranges[-1][1] Connector = HTTPSConnection if url.scheme == "https" else HTTPConnection @@ -166,8 +258,8 @@ def download_av_input(index_url, progress): return f.name -def download_subtitles_input(index_url, progress): - """Return a temporary file name where VTT subtitle has been downloaded/converted to SRT.""" +def 
_download_subtitles_input(index_url, progress): + # Return a temporary file name where VTT subtitle has been downloaded/converted to SRT subtitles_index = m3u8.load(index_url) urls = [subtitles_index.base_uri + "/" + f for f in subtitles_index.files] @@ -215,18 +307,18 @@ def download_inputs(remote_inputs, progress): subtitles_filename = None try: - video_filename = download_av_input( + video_filename = _download_av_stream( video_index_url, lambda i, n: progress("video", i, n) ) (audio_lang, audio_index_url) = audio_track - audio_filename = download_av_input( + audio_filename = _download_av_stream( audio_index_url, lambda i, n: progress("audio", i, n) ) if subtitles_track: (subtitles_lang, subtitles_index_url) = subtitles_track - subtitles_filename = download_subtitles_input( + subtitles_filename = _download_subtitles_input( subtitles_index_url, lambda i, n: progress("subtitles", i, n) ) From 14b9c1bb11c1ff5b99d99863958bbce1fcb35dc7 Mon Sep 17 00:00:00 2001 From: Barbagus Date: Wed, 14 Dec 2022 13:55:56 +0100 Subject: [PATCH 2/2] Adds the tests directory --- tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29