Get program information from page content

Changes the way the program information is figured out: from URL parsing
to page content parsing.
A massive JSON object is shipped within the HTML of the page; that is
where we get what we need.

Side effects:
 - drop `slug` from the program's info
 - drop `slug` naming option
 - no `Program` / `ProgramMeta` distinction

Includes some JSON samples.
This commit is contained in:
Barbagus 2023-01-14 19:51:02 +01:00
parent ba2dd96b36
commit 639a8063a5
9 changed files with 6532 additions and 63 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -13,16 +13,14 @@ def fetch_sources(http_session, url):
"""Fetch sources at a given ArteTV page URL.""" """Fetch sources at a given ArteTV page URL."""
from .api import fetch_program_info from .api import fetch_program_info
from .hls import fetch_program_tracks from .hls import fetch_program_tracks
from .www import parse_url from .www import fetch_program
site, program_id, slug = parse_url(url) p_meta = fetch_program(http_session, url)
variants = dict() variants = dict()
renditions = dict() renditions = dict()
p_meta, program_index_urls = fetch_program_info(http_session, site, program_id) program_index_urls = fetch_program_info(http_session, p_meta)
program = Program(program_id, slug, p_meta)
for program_index_url in program_index_urls: for program_index_url in program_index_urls:
v_tracks, a_track, s_track = fetch_program_tracks( v_tracks, a_track, s_track = fetch_program_tracks(
@ -43,7 +41,7 @@ def fetch_sources(http_session, url):
raise ValueError raise ValueError
return Sources( return Sources(
program, p_meta,
[Variant(key, source) for key, source in variants.items()], [Variant(key, source) for key, source in variants.items()],
[Rendition(key, source) for key, source in renditions.items()], [Rendition(key, source) for key, source in renditions.items()],
) )
@ -146,7 +144,7 @@ def compile_sources(sources, **naming_options):
build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options) build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)
return Target( return Target(
sources.program.meta, sources.program,
VideoTrack(v_meta, v_url), VideoTrack(v_meta, v_url),
AudioTrack(a_meta, a_url), AudioTrack(a_meta, a_url),
SubtitlesTrack(s_meta, s_url) if s_meta else None, SubtitlesTrack(s_meta, s_url) if s_meta else None,

View File

@ -23,7 +23,6 @@ Options:
--version print current version of the program --version print current version of the program
--debug on error, print debugging information --debug on error, print debugging information
--name-use-id use the program ID --name-use-id use the program ID
--name-use-slug use the URL slug
--name-sep=<sep> field separator [default: - ] --name-sep=<sep> field separator [default: - ]
--name-seq-pfx=<pfx> sequence counter prefix [default: - ] --name-seq-pfx=<pfx> sequence counter prefix [default: - ]
--name-seq-no-pad disable sequence zero-padding --name-seq-no-pad disable sequence zero-padding

View File

@ -27,38 +27,34 @@ def _fetch_api_data(http_session, path, object_type):
return obj["attributes"] return obj["attributes"]
def fetch_program_info(http_session, site, program_id): def fetch_program_info(http_session, p_meta):
"""Fetch the given program metadata and indexes.""" """Fetch the given program metadata and indexes."""
obj = _fetch_api_data(http_session, f"config/{site}/{program_id}", "ConfigPlayer") obj = _fetch_api_data(
http_session, f"config/{p_meta.site}/{p_meta.id}", "ConfigPlayer"
)
if (_ := obj["metadata"]["providerId"]) != program_id: if (_ := obj["metadata"]["providerId"]) != p_meta.id:
raise UnexpectedAPIResponse( raise UnexpectedAPIResponse(
"PROGRAM_ID_MISMATCH", "PROGRAM_ID_MISMATCH",
site, p_meta.site,
program_id, p_meta.id,
_, _,
) )
program_meta = ProgramMeta(
obj["metadata"]["title"],
obj["metadata"]["subtitle"],
obj["metadata"]["description"],
)
program_index_urls = set() program_index_urls = set()
for s in obj["streams"]: for s in obj["streams"]:
if (_ := s["protocol"]) != "HLS_NG": if (_ := s["protocol"]) != "HLS_NG":
raise UnsupportedHLSProtocol(site, program_id, _) raise UnsupportedHLSProtocol(p_meta.site, p_meta.id, _)
if (program_index_url := s["url"]) in program_index_urls: if (program_index_url := s["url"]) in program_index_urls:
raise UnexpectedAPIResponse( raise UnexpectedAPIResponse(
"DUPLICATE_PROGRAM_INDEX_URL", "DUPLICATE_PROGRAM_INDEX_URL",
site, p_meta.site,
program_id, p_meta.id,
program_index_url, program_index_url,
) )
program_index_urls.add(program_index_url) program_index_urls.add(program_index_url)
return program_meta, program_index_urls return program_index_urls

View File

@ -16,14 +16,32 @@ class ModuleError(Exception):
return f"{self.__class__}{self.args!r}" return f"{self.__class__}{self.args!r}"
class ExpectedError(ModuleError):
"""A feature limitation to submit as an enhancement to developers."""
class UnexpectedError(ModuleError): class UnexpectedError(ModuleError):
"""An error to report to developers.""" """An error to report to developers."""
class InvalidUrl(ModuleError): #
"""Invalid ArteTV URL.""" # www
#
class PageNotFound(ModuleError):
"""Page not found at ArteTV."""
class PageNotSupported(ExpectedError):
"""The page you are trying to download from is not (yet) supported."""
class InvalidPage(UnexpectedError):
"""Invalid ArteTV page."""
#
# Others
#
class UnexpectedAPIResponse(UnexpectedError): class UnexpectedAPIResponse(UnexpectedError):
"""Unexpected response from ArteTV.""" """Unexpected response from ArteTV."""

View File

@ -10,15 +10,18 @@ from typing import NamedTuple, Optional
class ProgramMeta(NamedTuple): class ProgramMeta(NamedTuple):
"""A program metadata.""" """A program metadata."""
site: str
"""The site where it is hosted (fr, de, etc...)."""
id: str
"""The ID."""
title: str title: str
"""The title.""" """The title."""
subtitle: str subtitle: str
"""The subtitle or secondary title.""" """The subtitle or secondary title."""
description: str
"""The description."""
class VideoMeta(NamedTuple): class VideoMeta(NamedTuple):
"""A video track metadata.""" """A video track metadata."""
@ -91,18 +94,10 @@ class Rendition(NamedTuple):
source: tuple[str, Optional[str]] source: tuple[str, Optional[str]]
class Program(NamedTuple):
"""A program representation."""
id: str
slug: str
meta: ProgramMeta
class Sources(NamedTuple): class Sources(NamedTuple):
"""A program's sources.""" """A program's sources."""
program: Program program: ProgramMeta
variants: list[Variant] variants: list[Variant]
renditions: list[Rendition] renditions: list[Rendition]

View File

@ -5,7 +5,7 @@
import re import re
from typing import Optional from typing import Optional
from .model import Program, VideoMeta, AudioMeta, SubtitlesMeta from .model import VideoMeta, AudioMeta, SubtitlesMeta
def file_name_builder( def file_name_builder(
@ -14,7 +14,6 @@ def file_name_builder(
s_meta: Optional[SubtitlesMeta], s_meta: Optional[SubtitlesMeta],
*, *,
use_id=False, use_id=False,
use_slug=False,
sep=" - ", sep=" - ",
seq_pfx=" - ", seq_pfx=" - ",
seq_no_pad=False, seq_no_pad=False,
@ -32,17 +31,14 @@ def file_name_builder(
def replace_sequence_counter(s: str) -> str: def replace_sequence_counter(s: str) -> str:
return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s) return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)
def build_file_name(program: Program) -> str: def build_file_name(p_meta) -> str:
"""Create a file name for given program.""" """Create a file name for given program."""
if use_id: if use_id:
return program.id return p_meta.id
if use_slug: fields = [replace_sequence_counter(p_meta.title)]
return program.slug if p_meta.subtitle:
fields.append(replace_sequence_counter(p_meta.subtitle))
fields = [replace_sequence_counter(program.meta.title)]
if program.meta.subtitle:
fields.add(replace_sequence_counter(program.meta.subtitles))
if add_resolution: if add_resolution:
fields.append(f"{v_meta.height}p") fields.append(f"{v_meta.height}p")

View File

@ -3,28 +3,85 @@
"""Provide ArteTV website utilities.""" """Provide ArteTV website utilities."""
from .error import InvalidUrl from contextlib import contextmanager
import json
BASE = "https://www.arte.tv/" from .error import InvalidPage, PageNotFound, PageNotSupported
SITES = ["fr", "de", "en", "es", "pl", "it"] from .model import ProgramMeta
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
def parse_url(url): @contextmanager
"""Parse ArteTV web URL into target ID and web UI language.""" def _schema_guard(*context):
if not url.startswith(BASE): try:
raise InvalidUrl("BASE", url) yield
except (KeyError, IndexError, ValueError) as e:
raise InvalidPage("SCHEMA", *context, e)
path = url[len(BASE) :].split("/")
site = path.pop(0) def _process_programs_page(page_value):
if site not in SITES: with _schema_guard():
raise InvalidUrl("SITE", url, site) site = page_value["language"]
if (_ := path.pop(0)) != "videos": content_zones = [
raise InvalidUrl("PATH", url, _) zone
for zone in page_value["zones"]
if zone["code"].startswith("program_content_")
]
id = path.pop(0) programs = [
slug = path.pop(0) ProgramMeta(
site, data_item["programId"], data_item["title"], data_item["subtitle"]
)
for zone in content_zones
for data_item in zone["content"]["data"]
if data_item["type"] == "program"
]
return site, id, slug if len(content_zones) != 1:
raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
if len(programs) != 1:
raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")
return programs[0]
def fetch_program(http_session, url):
"""Load the ArteTV page at given URL and return list of programs on it."""
r = http_session.get(url)
# special handling of 404
if r.status_code == 404:
raise PageNotFound(url)
# other network errors
r.raise_for_status()
# no HTML parsing required, we just find the mark
html = r.text
start = html.find(_DATA_MARK)
if start < 0:
raise InvalidPage("DATA_MARK_NOT_FOUND", url)
start += len(_DATA_MARK)
end = html.index("</script>", start)
try:
next_js_data = json.loads(html[start:end].strip())
except json.JSONDecodeError:
raise InvalidPage("INVALID_JSON_DATA", url)
with _schema_guard(url):
initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
initial_type = next_js_data["props"]["pageProps"]["initialType"]
try:
match initial_type:
case "programs":
return _process_programs_page(initial_page_value)
case _:
raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
except InvalidPage as e:
raise InvalidPage(e.args[0], url, *e.args[1:])