Merge pull request 'Fix #24 and #25' (#26) from vtt2srt into stable

Reviewed-on: #26
This commit is contained in:
Barbagus 2023-01-06 00:24:56 +00:00
commit 914f711670
6 changed files with 5641 additions and 28 deletions

View File

@ -143,7 +143,7 @@ So we can be more granular about _renditions_ and _variants_ that we want.
### Why not use `VTT` subtitles directly ?
Because it fails 😒.
Because FFmpeg does not support styles in WebVTT 😒.
### Why not use FFMPEG directly with the _media playlist_ URLs and let it do the download ?
@ -153,7 +153,6 @@ Because some programs would randomly fail 😒. Probably due to invalid _segment
## 📌 Dependencies
- [m3u8](https://pypi.org/project/m3u8/) to parse playlists.
- [webvtt-py](https://pypi.org/project/webvtt-py/) to load `vtt` subtitles files.
- [requests](https://pypi.org/project/requests/) to handle HTTP traffic.
## 🤝 Help

View File

@ -11,7 +11,6 @@ classifiers = ["License :: OSI Approved :: GNU Affero General Public License v3"
dynamic = ["version", "description"]
dependencies = [
"m3u8",
"webvtt-py",
"requests",
"docopt-ng"
]

3355
samples/vtt/captions.vtt Normal file

File diff suppressed because it is too large Load Diff

2216
samples/vtt/subtitles.vtt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -58,15 +58,12 @@
# preferences.
import contextlib
import io
import os
import re
from tempfile import NamedTemporaryFile
import m3u8
import webvtt
from . import error, model
from . import error, model, subtitles
#
# WARNING !
@ -279,24 +276,13 @@ def _download_subtitles_media(http_session, media_playlist_url, progress):
progress(0, 2)
r = http_session.get(url)
r.raise_for_status()
buffer = io.StringIO(r.text)
r.encoding = "utf-8"
progress(1, 2)
with NamedTemporaryFile(
"w", delete=False, prefix="delarte.", suffix=".srt", encoding="utf8"
) as f:
i = 1
for caption in webvtt.read_buffer(buffer):
print(i, file=f)
print(
re.sub(r"\.", ",", caption.start)
+ " --> "
+ re.sub(r"\.", ",", caption.end),
file=f,
)
print(caption.text + "\n", file=f)
i += 1
subtitles.convert(r.text, f)
progress(2, 2)
return f.name
@ -313,14 +299,6 @@ def download_source(http_session, source, progress):
subtitles_filename = None
try:
video_filename = _download_av_media(
http_session, source.video, lambda i, n: progress("video", i, n)
)
audio_filename = _download_av_media(
http_session, source.audio, lambda i, n: progress("audio", i, n)
)
subtitles_filename = (
_download_subtitles_media(
http_session,
@ -331,6 +309,14 @@ def download_source(http_session, source, progress):
else None
)
video_filename = _download_av_media(
http_session, source.video, lambda i, n: progress("video", i, n)
)
audio_filename = _download_av_media(
http_session, source.audio, lambda i, n: progress("audio", i, n)
)
yield model.Source(
source.metadata,
source.rendition,

58
src/delarte/subtitles.py Normal file
View File

@ -0,0 +1,58 @@
# License: GNU AGPL v3: http://www.gnu.org/licenses/
# This file is part of `delarte` (https://git.afpy.org/fcode/delarte.git)
"""Provide WebVTT to SRT subtitles conversion."""
import re
from . import error
class Error(error.UnexpectedError):
    """Unexpected WebVTT data.

    Raised by `convert()` with a short code as its single argument:
    "INVALID_DATA", "INVALID_HEADER" or "EMPTY_DATA".
    """
# Timing line of a cue, e.g. "00:01:02.345 --> 00:01:04.000".
# Groups: 1 = start time, 2 = start milliseconds, 3 = end time,
# 4 = end milliseconds. The hours part is optional in WebVTT, hence the `?`.
RE_CUE_START = (
    r"^((?:\d\d:)?\d\d:\d\d)\.(\d\d\d) --> ((?:\d\d:)?\d\d:\d\d)\.(\d\d\d)"
)

# Payload line wrapped in a `<c.COLOR.bg_BACKGROUND>...</c>` class span.
# Groups: 1 = foreground color name, 2 = text.
RE_STYLED_CUE = r"^<c\.(\w+)\.bg_(?:\w+)>(.*)</c>$"


def _split_blocks(text):
    """Split text into blocks (lists of non-empty lines) separated by blank lines."""
    blocks = []
    current = []
    for line in text.splitlines():
        if line:
            current.append(line)
        elif current:
            # A blank line closes the running block; additional consecutive
            # blank lines are ignored so they never start a bogus block.
            blocks.append(current)
            current = []
    if current:
        blocks.append(current)
    return blocks


def _srt_timestamp(clock, millis):
    """Build an SRT timestamp, adding the hours part when WebVTT omitted it."""
    if len(clock) == len("mm:ss"):
        clock = "00:" + clock
    return f"{clock},{millis}"


def convert(input, output):
    """Convert input ArteTV's WebVTT string data and write it on output file.

    `input` is the whole WebVTT document as a string and `output` is a
    writable text file object receiving the SRT data.

    Blocks whose first line is not a cue timing line are skipped. Payload
    lines matching `RE_STYLED_CUE` are rewritten as `<font color="...">` tags,
    any other payload line is copied as is.

    Raises `Error("INVALID_DATA")` when the input holds no block at all,
    `Error("INVALID_HEADER")` when the first block is not a single line
    starting with "WEBVTT" and `Error("EMPTY_DATA")` when no cue was found.
    """
    # This is a very (very) simple implementation based on what has actually
    # been seen on ArteTV and is not at all a generic WebVTT solution.

    # WebVTT allows an optional byte order mark before the signature.
    blocks = _split_blocks(input.lstrip("\ufeff"))
    if not blocks:
        raise Error("INVALID_DATA")

    header, *candidates = blocks
    if not (len(header) == 1 and header[0].startswith("WEBVTT")):
        raise Error("INVALID_HEADER")

    counter = 0
    for first_line, *payload in candidates:
        timing = re.match(RE_CUE_START, first_line)
        if not timing:
            # Not a cue block: ignore it.
            continue

        counter += 1
        start = _srt_timestamp(timing[1], timing[2])
        end = _srt_timestamp(timing[3], timing[4])
        print(counter, file=output)
        print(f"{start} --> {end}", file=output)
        for line in payload:
            if styled := re.match(RE_STYLED_CUE, line):
                print(
                    f'<font color="{styled[1]}">{styled[2]}</font>',
                    file=output,
                )
            else:
                print(line, file=output)
        # SRT entries are separated by an empty line.
        print("", file=output)

    if counter == 0:
        raise Error("EMPTY_DATA")