Use `requests` library instead of `urllib`

Enables by default:
- gzip compression
- connection pooling (through a shared session)
This commit is contained in:
Barbagus 2022-12-20 23:46:44 +01:00
parent 458d4cbb6d
commit 88ffe31a94
5 changed files with 67 additions and 69 deletions

View File

@ -288,6 +288,7 @@ Because some programs would randomly fail 😒. Probably due to invalid _segment
- [m3u8](https://pypi.org/project/m3u8/) to parse playlists.
- [webvtt-py](https://pypi.org/project/webvtt-py/) to load `vtt` subtitles files.
- [requests](https://pypi.org/project/requests/) to handle HTTP traffic.
### 🤝 Help

View File

@ -12,6 +12,7 @@ dynamic = ["version", "description"]
dependencies = [
"m3u8",
"webvtt-py",
"requests"
]
[project.urls]

View File

@ -12,6 +12,8 @@ usage: delarte [-h|--help] - print this message
import sys
import time
import requests
from . import api
from . import hls
from . import muxing
@ -73,13 +75,15 @@ def main():
print(__doc__)
return 0
http_session = requests.sessions.Session()
try:
www_lang, program_id = www.parse_url(args.pop(0))
except ValueError as e:
return _fail(f"Invalid url: {e}")
try:
config = api.load_config(www_lang, program_id)
config = api.load_config(http_session, www_lang, program_id)
except ValueError:
return _fail("Invalid program")
@ -93,7 +97,7 @@ def main():
_print_available_renditions(config, sys.stderr)
return 1
master_playlist = hls.load_master_playlist(master_playlist_url)
master_playlist = hls.load_master_playlist(http_session, master_playlist_url)
if not args:
_print_available_variants(master_playlist, sys.stdout)
@ -109,7 +113,7 @@ def main():
progress = create_progress()
with hls.download_inputs(remote_inputs, progress) as temp_inputs:
with hls.download_inputs(http_session, remote_inputs, progress) as temp_inputs:
muxing.mux(temp_inputs, file_base_name, progress)

View File

@ -3,35 +3,38 @@
"""Provide ArteTV JSON API utilities."""
import json
from http import HTTPStatus
from urllib.request import urlopen
MIME_TYPE = "application/vnd.api+json; charset=utf-8"
def load_api_data(url):
"""Retrieve the root node (infamous "data") of an API call response."""
http_response = urlopen(url)
def _fetch_api_data(http_session, path, object_type):
# Fetch an API object.
# `path` is appended to the player API base URL below; `object_type` is the
# value the response's data["type"] must have. Returns the "data" node of the
# JSON body.
url = "https://api.arte.tv/api/player/v2/" + path
if http_response.status != HTTPStatus.OK:
raise RuntimeError("API request failed")
r = http_session.get(url)
# A 404 is reported as ValueError (message carries the URL); any other HTTP
# error status is left to raise_for_status() further down.
if r.status_code == 404:
raise ValueError(f"{url}: not found")
if (
http_response.getheader("Content-Type")
!= "application/vnd.api+json; charset=utf-8"
):
r.raise_for_status()
# The comparison is an exact string match against MIME_TYPE, charset suffix
# included.
# NOTE(review): indexing `r.headers` presumably raises KeyError when the
# header is absent, which is not the ValueError used for the other rejections
# in this function - confirm whether a missing Content-Type should be
# reported the same way.
if r.headers["content-type"] != MIME_TYPE:
raise ValueError("API response not supported")
return json.load(http_response)["data"]
obj = r.json()["data"]
def load_config(lang, program_id):
"""Retrieve a program config from API."""
url = f"https://api.arte.tv/api/player/v2/config/{lang}/{program_id}"
config = load_api_data(url)
if config["type"] != "ConfigPlayer":
# Reject a payload whose declared type is not the one the caller asked for.
if obj["type"] != object_type:
raise ValueError("Invalid API response")
return obj
def load_config(http_session, lang, program_id):
"""Retrieve a program config from API."""
url = f"config/{lang}/{program_id}"
config = _fetch_api_data(
http_session, f"config/{lang}/{program_id}", "ConfigPlayer"
)
if config["attributes"]["metadata"]["providerId"] != program_id:
raise ValueError("Invalid API response")

View File

@ -61,11 +61,8 @@ import contextlib
import io
import os
import re
from http import HTTPStatus
from http.client import HTTPConnection, HTTPSConnection
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
from urllib.request import urlopen
import m3u8
import webvtt
@ -98,9 +95,16 @@ def _is_relative_file_path(uri):
return False
def load_master_playlist(url):
def _fetch_playlist(http_session, url):
    # Download the M3U8 document found at `url` and return it parsed by m3u8.
    response = http_session.get(url)
    # Let an HTTP error status surface before any parsing is attempted.
    response.raise_for_status()
    playlist_text = response.text
    # `url` is handed to the parser as its second positional argument
    # (presumably the base used to resolve the playlist's relative URIs).
    return m3u8.loads(playlist_text, url)
def load_master_playlist(http_session, url):
"""Download and return a master playlist."""
master_playlist = m3u8.load(url)
master_playlist = _fetch_playlist(http_session, url)
if not master_playlist.playlists:
raise ValueError("Unexpected missing playlists")
@ -184,8 +188,8 @@ def _parse_byterange(obj):
return offset, offset + count - 1
def _load_av_segments(media_playlist_url):
media_playlist = m3u8.load(media_playlist_url)
def _load_av_segments(http_session, media_playlist_url):
media_playlist = _fetch_playlist(http_session, media_playlist_url)
file_name = media_playlist.segment_map[0].uri
range_start, range_end = _parse_byterange(media_playlist.segment_map[0])
@ -207,61 +211,45 @@ def _load_av_segments(media_playlist_url):
chunks.append((range_start, range_end))
total = range_end + 1
return urlparse(media_playlist.segment_map[0].absolute_uri), chunks
return media_playlist.segment_map[0].absolute_uri, chunks
def _download_av_stream(media_playlist_url, progress):
def _download_av_stream(http_session, media_playlist_url, progress):
# Download an audio or video stream to temporary directory
# The byte ranges returned by _load_av_segments are requested one at a time;
# each payload is appended to a temporary ".mp4" file whose name is returned.
url, ranges = _load_av_segments(media_playlist_url)
url, ranges = _load_av_segments(http_session, media_playlist_url)
# End offset of the last range; reported to progress() as the total.
total = ranges[-1][1]
Connector = HTTPSConnection if url.scheme == "https" else HTTPConnection
connection = Connector(url.hostname)
connection.connect()
# delete=False: the file has to outlive this function, which returns its name.
with (
NamedTemporaryFile(
mode="w+b", delete=False, prefix="delarte.", suffix=".mp4"
) as f,
contextlib.closing(connection) as c,
) as f
):
for range_start, range_end in ranges:
c.request(
"GET",
url.path,
r = http_session.get(
url,
headers={
"Accept": "*/*",
"Accept-Language": "fr,en;q=0.7,en-US;q=0.3",
"Accept-Encoding": "gzip, deflate, br, identity",
"Range": f"bytes={range_start}-{range_end}",
"Origin": "https://www.arte.tv",
"Connection": "keep-alive",
"Referer": "https://www.arte.tv/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site",
"Sec-GPC": "1",
"DNT": "1",
},
)
r = c.getresponse()
if r.status != 206:
r.raise_for_status()
# Only a 206 answer to the Range request is accepted.
if r.status_code != 206:
# NOTE(review): this message still reads `r.status`, while the check above
# reads `r.status_code` on the object returned by http_session.get(). If that
# object is a requests.Response (as the commit title suggests) it has no
# `status` attribute, so building this message would raise AttributeError
# instead of the intended ValueError - confirm and switch to `r.status_code`.
raise ValueError(f"Invalid response status {r.status}")
content = r.read()
if len(content) != range_end - range_start + 1:
# The payload must be exactly as long as the inclusive byte range requested.
if len(r.content) != range_end - range_start + 1:
raise ValueError("Invalid range length")
f.write(content)
f.write(r.content)
progress(range_end, total)
return f.name
def _download_subtitles_input(index_url, progress):
def _download_subtitles_input(http_session, index_url, progress):
# Return a temporary file name where VTT subtitle has been downloaded/converted to SRT
subtitles_index = m3u8.load(index_url)
urls = [subtitles_index.base_uri + "/" + f for f in subtitles_index.files]
subtitles_index = _fetch_playlist(http_session, index_url)
urls = [s.absolute_uri for s in subtitles_index.segments]
if not urls:
raise ValueError("No subtitle files")
@ -270,11 +258,10 @@ def _download_subtitles_input(index_url, progress):
raise ValueError("Multiple subtitle files")
progress(0, 2)
http_response = urlopen(urls[0])
if http_response.status != HTTPStatus.OK:
raise RuntimeError("Subtitle request failed")
r = http_session.get(urls[0])
r.raise_for_status()
buffer = io.StringIO(http_response.read().decode("utf8"))
buffer = io.StringIO(r.text)
progress(1, 2)
with NamedTemporaryFile(
@ -296,7 +283,7 @@ def _download_subtitles_input(index_url, progress):
@contextlib.contextmanager
def download_inputs(remote_inputs, progress):
def download_inputs(http_session, remote_inputs, progress):
"""Download inputs in temporary files."""
# It is implemented as a context manager that will delete temporary files on exit.
@ -308,18 +295,20 @@ def download_inputs(remote_inputs, progress):
try:
video_filename = _download_av_stream(
video_index_url, lambda i, n: progress("video", i, n)
http_session, video_index_url, lambda i, n: progress("video", i, n)
)
(audio_lang, audio_index_url) = audio_track
audio_filename = _download_av_stream(
audio_index_url, lambda i, n: progress("audio", i, n)
http_session, audio_index_url, lambda i, n: progress("audio", i, n)
)
if subtitles_track:
(subtitles_lang, subtitles_index_url) = subtitles_track
subtitles_filename = _download_subtitles_input(
subtitles_index_url, lambda i, n: progress("subtitles", i, n)
http_session,
subtitles_index_url,
lambda i, n: progress("subtitles", i, n),
)
yield (