Fix #24 and #25

Remove the dependency on `webvtt-py`, which was both too much and not enough for our use case. Implement a basic WebVTT-to-SRT converter tailored to ArteTV's usage of WebVTT features.
2023-01-06 01:17:55 +01:00 · 2023-01-06 01:17:55 +01:00 · 96f411cca0
parent 8d216215dd
commit 96f411cca0
6 changed files with 5641 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -143,7 +143,7 @@ So we can be more granular about _renditions_ and _variants_ that we want.

 ### Why not use `VTT` subtitles directly ?

-Because it fails 😒.
+Because FFMPEG does not support styles in WebVTT 😒.

 ### Why not use FFMPEG directly with the _media playlist_ URLs and let it do the download ?

@ -153,7 +153,6 @@ Because some programs would randomly fail 😒. Probably due to invalid _segment
 ## 📌 Dependencies

 - [m3u8](https://pypi.org/project/m3u8/) to parse playlists.
- [webvtt-py](https://pypi.org/project/webvtt-py/) to load `vtt` subtitles files.
 - [requests](https://pypi.org/project/requests/) to handle HTTP traffic.

 ## 🤝 Help
--- a/pyproject.toml
+++ b/pyproject.toml
@ -11,7 +11,6 @@ classifiers = ["License :: OSI Approved :: GNU Affero General Public License v3"
 dynamic = ["version", "description"]
 dependencies = [
    "m3u8",
-    "webvtt-py",
    "requests",
    "docopt-ng"
 ]
--- a/samples/vtt/captions.vtt
+++ b/samples/vtt/captions.vtt
--- a/samples/vtt/subtitles.vtt
+++ b/samples/vtt/subtitles.vtt
--- a/src/delarte/hls.py
+++ b/src/delarte/hls.py
@ -58,15 +58,12 @@
 #    preferences.

 import contextlib
-import io
 import os
-import re
 from tempfile import NamedTemporaryFile

 import m3u8
-import webvtt

-from . import error, model
+from . import error, model, subtitles

 #
 # WARNING !
@ -279,24 +276,13 @@ def _download_subtitles_media(http_session, media_playlist_url, progress):
    progress(0, 2)
    r = http_session.get(url)
    r.raise_for_status()
-
-    buffer = io.StringIO(r.text)
+    r.encoding = "utf-8"
    progress(1, 2)

    with NamedTemporaryFile(
        "w", delete=False, prefix="delarte.", suffix=".srt", encoding="utf8"
    ) as f:
-        i = 1
-        for caption in webvtt.read_buffer(buffer):
-            print(i, file=f)
-            print(
-                re.sub(r"\.", ",", caption.start)
-                + " --> "
-                + re.sub(r"\.", ",", caption.end),
-                file=f,
-            )
-            print(caption.text + "\n", file=f)
-            i += 1
+        subtitles.convert(r.text, f)
        progress(2, 2)
        return f.name

@ -313,14 +299,6 @@ def download_source(http_session, source, progress):
    subtitles_filename = None

    try:
-        video_filename = _download_av_media(
-            http_session, source.video, lambda i, n: progress("video", i, n)
-        )
-
-        audio_filename = _download_av_media(
-            http_session, source.audio, lambda i, n: progress("audio", i, n)
-        )
-
        subtitles_filename = (
            _download_subtitles_media(
                http_session,
@ -331,6 +309,14 @@ def download_source(http_session, source, progress):
            else None
        )

+        video_filename = _download_av_media(
+            http_session, source.video, lambda i, n: progress("video", i, n)
+        )
+
+        audio_filename = _download_av_media(
+            http_session, source.audio, lambda i, n: progress("audio", i, n)
+        )
+
        yield model.Source(
            source.metadata,
            source.rendition,
--- a/src/delarte/subtitles.py
+++ b/src/delarte/subtitles.py
@ -0,0 +1,58 @@
+# License: GNU AGPL v3: http://www.gnu.org/licenses/
+# This file is part of `delarte` (https://git.afpy.org/fcode/delarte.git)
+
+"""Provide WebVTT to SRT subtitles conversion."""
+
+import re
+
+from . import error
+
+
+class Error(error.UnexpectedError):
+    """Unexpected WebVTT data."""
+
+
+RE_CUE_START = r"^((?:\d\d:)\d\d:\d\d)\.(\d\d\d) --> ((?:\d\d:)\d\d:\d\d)\.(\d\d\d)"
+RE_STYLED_CUE = r"^<c\.(\w+)\.bg_(?:\w+)>(.*)</c>$"
+
+
+def convert(input, output):
+    """Convert input ArteTV's WebVTT string data and write it on output file."""
+    # This is a very (very) simple implementation based on what has actually
+    # been seen on ArteTV and is not at all a generic WebVTT solution.
+
+    blocks = []
+    block = []
+
+    for line in input.splitlines():
+        if not line and block:
+            blocks.append(block)
+            block = []
+        else:
+            block.append(line)
+    if block:
+        blocks.append(block)
+        block = []
+
+    if not blocks:
+        raise Error("INVALID_DATA")
+
+    header = blocks.pop(0)
+    if not (len(header) == 1 and header[0].startswith("WEBVTT")):
+        raise Error("INVALID_HEADER")
+
+    counter = 1
+    for block in blocks:
+        if m := re.match(RE_CUE_START, block.pop(0)):
+            print(f"{counter}", file=output)
+            print(f"{m[1]},{m[2]} --> {m[3]},{m[4]}", file=output)
+            for line in block:
+                if m := re.match(RE_STYLED_CUE, line):
+                    print(f'<font color="{m[1]}">{m[2]}</font>', file=output)
+                else:
+                    print(line, file=output)
+            print("", file=output)
+            counter += 1
+
+    if counter == 1:
+        raise Error("EMPTY_DATA")