Get program information from page content

Changes the way the program information is figured out: from URL parsing
to page content parsing.
A massive JSON object is shipped within the HTML of the page; that's
where we get what we need from.

Side effects:
 - drop `slug` from the program's info
 - drop `slug` naming option
 - no `Program` / `ProgramMeta` distinction

Includes some JSON samples.
This commit is contained in:
Barbagus 2023-01-14 19:51:02 +01:00
parent ba2dd96b36
commit 639a8063a5
9 changed files with 6532 additions and 63 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -13,16 +13,14 @@ def fetch_sources(http_session, url):
"""Fetch sources at a given ArteTV page URL."""
from .api import fetch_program_info
from .hls import fetch_program_tracks
from .www import parse_url
from .www import fetch_program
site, program_id, slug = parse_url(url)
p_meta = fetch_program(http_session, url)
variants = dict()
renditions = dict()
p_meta, program_index_urls = fetch_program_info(http_session, site, program_id)
program = Program(program_id, slug, p_meta)
program_index_urls = fetch_program_info(http_session, p_meta)
for program_index_url in program_index_urls:
v_tracks, a_track, s_track = fetch_program_tracks(
@ -43,7 +41,7 @@ def fetch_sources(http_session, url):
raise ValueError
return Sources(
program,
p_meta,
[Variant(key, source) for key, source in variants.items()],
[Rendition(key, source) for key, source in renditions.items()],
)
@ -146,7 +144,7 @@ def compile_sources(sources, **naming_options):
build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)
return Target(
sources.program.meta,
sources.program,
VideoTrack(v_meta, v_url),
AudioTrack(a_meta, a_url),
SubtitlesTrack(s_meta, s_url) if s_meta else None,

View File

@ -23,7 +23,6 @@ Options:
--version print current version of the program
--debug on error, print debugging information
--name-use-id use the program ID
--name-use-slug use the URL slug
--name-sep=<sep> field separator [default: - ]
--name-seq-pfx=<pfx> sequence counter prefix [default: - ]
--name-seq-no-pad disable sequence zero-padding

View File

@ -27,38 +27,34 @@ def _fetch_api_data(http_session, path, object_type):
return obj["attributes"]
def fetch_program_info(http_session, site, program_id):
def fetch_program_info(http_session, p_meta):
"""Fetch the given program metadata and indexes."""
obj = _fetch_api_data(http_session, f"config/{site}/{program_id}", "ConfigPlayer")
obj = _fetch_api_data(
http_session, f"config/{p_meta.site}/{p_meta.id}", "ConfigPlayer"
)
if (_ := obj["metadata"]["providerId"]) != program_id:
if (_ := obj["metadata"]["providerId"]) != p_meta.id:
raise UnexpectedAPIResponse(
"PROGRAM_ID_MISMATCH",
site,
program_id,
p_meta.site,
p_meta.id,
_,
)
program_meta = ProgramMeta(
obj["metadata"]["title"],
obj["metadata"]["subtitle"],
obj["metadata"]["description"],
)
program_index_urls = set()
for s in obj["streams"]:
if (_ := s["protocol"]) != "HLS_NG":
raise UnsupportedHLSProtocol(site, program_id, _)
raise UnsupportedHLSProtocol(p_meta.site, p_meta.id, _)
if (program_index_url := s["url"]) in program_index_urls:
raise UnexpectedAPIResponse(
"DUPLICATE_PROGRAM_INDEX_URL",
site,
program_id,
p_meta.site,
p_meta.id,
program_index_url,
)
program_index_urls.add(program_index_url)
return program_meta, program_index_urls
return program_index_urls

View File

@ -16,14 +16,32 @@ class ModuleError(Exception):
return f"{self.__class__}{self.args!r}"
class ExpectedError(ModuleError):
"""A feature limitation to submit as an enhancement to developers."""
class UnexpectedError(ModuleError):
"""An error to report to developers."""
class InvalidUrl(ModuleError):
"""Invalid ArteTV URL."""
#
# www
#
class PageNotFound(ModuleError):
"""Page not found at ArteTV."""
class PageNotSupported(ExpectedError):
"""The page you are trying to download from is not (yet) supported."""
class InvalidPage(UnexpectedError):
"""Invalid ArteTV page."""
#
# Others
#
class UnexpectedAPIResponse(UnexpectedError):
"""Unexpected response from ArteTV."""

View File

@ -10,15 +10,18 @@ from typing import NamedTuple, Optional
class ProgramMeta(NamedTuple):
"""A program metadata."""
site: str
"""The site where it is hosted (fr, de, etc...)."""
id: str
"""The ID."""
title: str
"""The title."""
subtitle: str
"""The subtitle or secondary title."""
description: str
"""The description."""
class VideoMeta(NamedTuple):
"""A video track metadata."""
@ -91,18 +94,10 @@ class Rendition(NamedTuple):
source: tuple[str, Optional[str]]
class Program(NamedTuple):
"""A program representation."""
id: str
slug: str
meta: ProgramMeta
class Sources(NamedTuple):
"""A program's sources."""
program: Program
program: ProgramMeta
variants: list[Variant]
renditions: list[Rendition]

View File

@ -5,7 +5,7 @@
import re
from typing import Optional
from .model import Program, VideoMeta, AudioMeta, SubtitlesMeta
from .model import VideoMeta, AudioMeta, SubtitlesMeta
def file_name_builder(
@ -14,7 +14,6 @@ def file_name_builder(
s_meta: Optional[SubtitlesMeta],
*,
use_id=False,
use_slug=False,
sep=" - ",
seq_pfx=" - ",
seq_no_pad=False,
@ -32,17 +31,14 @@ def file_name_builder(
def replace_sequence_counter(s: str) -> str:
return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)
def build_file_name(program: Program) -> str:
def build_file_name(p_meta) -> str:
"""Create a file name for given program."""
if use_id:
return program.id
return p_meta.id
if use_slug:
return program.slug
fields = [replace_sequence_counter(program.meta.title)]
if program.meta.subtitle:
fields.add(replace_sequence_counter(program.meta.subtitles))
fields = [replace_sequence_counter(p_meta.title)]
if p_meta.subtitle:
fields.append(replace_sequence_counter(p_meta.subtitle))
if add_resolution:
fields.append(f"{v_meta.height}p")

View File

@ -3,28 +3,85 @@
"""Provide ArteTV website utilities."""
from .error import InvalidUrl
from contextlib import contextmanager
import json
BASE = "https://www.arte.tv/"
SITES = ["fr", "de", "en", "es", "pl", "it"]
from .error import InvalidPage, PageNotFound, PageNotSupported
from .model import ProgramMeta
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
def parse_url(url):
"""Parse ArteTV web URL into target ID and web UI language."""
if not url.startswith(BASE):
raise InvalidUrl("BASE", url)
@contextmanager
def _schema_guard(*context):
try:
yield
except (KeyError, IndexError, ValueError) as e:
raise InvalidPage("SCHEMA", *context, e)
path = url[len(BASE) :].split("/")
site = path.pop(0)
def _process_programs_page(page_value):
if site not in SITES:
raise InvalidUrl("SITE", url, site)
with _schema_guard():
site = page_value["language"]
if (_ := path.pop(0)) != "videos":
raise InvalidUrl("PATH", url, _)
content_zones = [
zone
for zone in page_value["zones"]
if zone["code"].startswith("program_content_")
]
id = path.pop(0)
slug = path.pop(0)
programs = [
ProgramMeta(
site, data_item["programId"], data_item["title"], data_item["subtitle"]
)
for zone in content_zones
for data_item in zone["content"]["data"]
if data_item["type"] == "program"
]
return site, id, slug
if len(content_zones) != 1:
raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
if len(programs) != 1:
raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")
return programs[0]
def fetch_program(http_session, url):
"""Load the ArteTV page at given URL and return list of programs on it."""
r = http_session.get(url)
# special handling of 404
if r.status_code == 404:
raise PageNotFound(url)
# other network errors
r.raise_for_status()
# no HTML parsing required, whe just find the mark
html = r.text
start = html.find(_DATA_MARK)
if start < 0:
raise InvalidPage("DATA_MARK_NOT_FOUND", url)
start += len(_DATA_MARK)
end = html.index("</script>", start)
try:
next_js_data = json.loads(html[start:end].strip())
except json.JSONDecodeError:
raise InvalidPage("INVALID_JSON_DATA", url)
with _schema_guard(url):
initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
initial_type = next_js_data["props"]["pageProps"]["initialType"]
try:
match initial_type:
case "programs":
return _process_programs_page(initial_page_value)
case _:
raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
except InvalidPage as e:
raise InvalidPage(e.args[0], url, *e.args[1:])