Get program information from page content

Changes the way the program information is figured out: from URL parsing to page content parsing. A massive JSON object is shipped within the HTML of the page; that's where we get what we need from. Side effects: - drop `slug` from the program's info - drop `slug` naming option - no `Program` / `ProgramMeta` distinction. Includes some JSON samples.
2023-01-14 19:51:02 +01:00 · 2023-01-14 19:51:02 +01:00 · 639a8063a5
commit 639a8063a5
parent ba2dd96b36
9 changed files with 6532 additions and 63 deletions
--- a/samples/www/103011-000-A__l-assaut-du-capitole.json
+++ b/samples/www/103011-000-A__l-assaut-du-capitole.json
--- a/samples/www/109041-002-A__acquitted-saison-1-2-10.json
+++ b/samples/www/109041-002-A__acquitted-saison-1-2-10.json
--- a/src/delarte/init.py
+++ b/src/delarte/init.py
@ -13,16 +13,14 @@ def fetch_sources(http_session, url):
    """Fetch sources at a given ArteTV page URL."""
    from .api import fetch_program_info
    from .hls import fetch_program_tracks
-    from .www import parse_url
+    from .www import fetch_program
-    site, program_id, slug = parse_url(url)
+    p_meta = fetch_program(http_session, url)
    variants = dict()
    renditions = dict()
-    p_meta, program_index_urls = fetch_program_info(http_session, site, program_id)
+    program_index_urls = fetch_program_info(http_session, p_meta)
    program = Program(program_id, slug, p_meta)
    for program_index_url in program_index_urls:
        v_tracks, a_track, s_track = fetch_program_tracks(
@ -43,7 +41,7 @@ def fetch_sources(http_session, url):
            raise ValueError
    return Sources(
-        program,
+        p_meta,
        [Variant(key, source) for key, source in variants.items()],
        [Rendition(key, source) for key, source in renditions.items()],
    )
@ -146,7 +144,7 @@ def compile_sources(sources, **naming_options):
    build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)
    return Target(
-        sources.program.meta,
+        sources.program,
        VideoTrack(v_meta, v_url),
        AudioTrack(a_meta, a_url),
        SubtitlesTrack(s_meta, s_url) if s_meta else None,
--- a/src/delarte/main.py
+++ b/src/delarte/main.py
@ -23,7 +23,6 @@ Options:
  --version              print current version of the program
  --debug                on error, print debugging information
  --name-use-id          use the program ID
  --name-use-slug        use the URL slug
  --name-sep=<sep>       field separator [default:  - ]
  --name-seq-pfx=<pfx>   sequence counter prefix [default:  - ]
  --name-seq-no-pad      disable sequence zero-padding
--- a/src/delarte/api.py
+++ b/src/delarte/api.py
@ -27,38 +27,34 @@ def _fetch_api_data(http_session, path, object_type):
    return obj["attributes"]
-def fetch_program_info(http_session, site, program_id):
+def fetch_program_info(http_session, p_meta):
    """Fetch the given program metadata and indexes."""
-    obj = _fetch_api_data(http_session, f"config/{site}/{program_id}", "ConfigPlayer")
+    obj = _fetch_api_data(
        http_session, f"config/{p_meta.site}/{p_meta.id}", "ConfigPlayer"
    )
-    if (_ := obj["metadata"]["providerId"]) != program_id:
+    if (_ := obj["metadata"]["providerId"]) != p_meta.id:
        raise UnexpectedAPIResponse(
            "PROGRAM_ID_MISMATCH",
-            site,
+            p_meta.site,
-            program_id,
+            p_meta.id,
            _,
        )
    program_meta = ProgramMeta(
        obj["metadata"]["title"],
        obj["metadata"]["subtitle"],
        obj["metadata"]["description"],
    )
    program_index_urls = set()
    for s in obj["streams"]:
        if (_ := s["protocol"]) != "HLS_NG":
-            raise UnsupportedHLSProtocol(site, program_id, _)
+            raise UnsupportedHLSProtocol(p_meta.site, p_meta.id, _)
        if (program_index_url := s["url"]) in program_index_urls:
            raise UnexpectedAPIResponse(
                "DUPLICATE_PROGRAM_INDEX_URL",
-                site,
+                p_meta.site,
-                program_id,
+                p_meta.id,
                program_index_url,
            )
        program_index_urls.add(program_index_url)
-    return program_meta, program_index_urls
+    return program_index_urls
--- a/src/delarte/error.py
+++ b/src/delarte/error.py
@ -16,14 +16,32 @@ class ModuleError(Exception):
        return f"{self.__class__}{self.args!r}"
 class ExpectedError(ModuleError):
    """A feature limitation to submit as an enhancement to developers."""
 class UnexpectedError(ModuleError):
    """An error to report to developers."""
-class InvalidUrl(ModuleError):
+#
-    """Invalid ArteTV URL."""
+# www
 #
 class PageNotFound(ModuleError):
    """Page not found at ArteTV."""
 class PageNotSupported(ExpectedError):
    """The page you are trying to download from is not (yet) supported."""
 class InvalidPage(UnexpectedError):
    """Invalid ArteTV page."""
 #
 # Others
 #
 class UnexpectedAPIResponse(UnexpectedError):
    """Unexpected response from ArteTV."""
--- a/src/delarte/model.py
+++ b/src/delarte/model.py
@ -10,15 +10,18 @@ from typing import NamedTuple, Optional
 class ProgramMeta(NamedTuple):
    """Metadata describing a program."""
    # NOTE(review): `_process_programs_page` in www.py builds this tuple with
    # four positional values (site, programId, title, subtitle) -- confirm
    # whether `description` is still meant to be a required field.
    site: str
    """The site where it is hosted (fr, de, etc...)."""
    id: str
    """The ID."""
    title: str
    """The title."""
    subtitle: str
    """The subtitle or secondary title."""
    description: str
    """The description."""
 class VideoMeta(NamedTuple):
    """A video track metadata."""
@ -91,18 +94,10 @@ class Rendition(NamedTuple):
    source: tuple[str, Optional[str]]
 class Program(NamedTuple):
    """A program representation."""
    id: str
    slug: str
    meta: ProgramMeta
 class Sources(NamedTuple):
    """A program's sources."""
-    program: Program
+    program: ProgramMeta
    variants: list[Variant]
    renditions: list[Rendition]
--- a/src/delarte/naming.py
+++ b/src/delarte/naming.py
@ -5,7 +5,7 @@
 import re
 from typing import Optional
-from .model import Program, VideoMeta, AudioMeta, SubtitlesMeta
+from .model import VideoMeta, AudioMeta, SubtitlesMeta
 def file_name_builder(
@ -14,7 +14,6 @@ def file_name_builder(
    s_meta: Optional[SubtitlesMeta],
    *,
    use_id=False,
    use_slug=False,
    sep=" - ",
    seq_pfx=" - ",
    seq_no_pad=False,
@ -32,17 +31,14 @@ def file_name_builder(
    def replace_sequence_counter(s: str) -> str:
        return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)
-    def build_file_name(program: Program) -> str:
+    def build_file_name(p_meta) -> str:
        """Create a file name for given program."""
        if use_id:
-            return program.id
+            return p_meta.id
-        if use_slug:
+        fields = [replace_sequence_counter(p_meta.title)]
-            return program.slug
+        if p_meta.subtitle:
-
+            fields.append(replace_sequence_counter(p_meta.subtitle))
        fields = [replace_sequence_counter(program.meta.title)]
        if program.meta.subtitle:
            fields.add(replace_sequence_counter(program.meta.subtitles))
        if add_resolution:
            fields.append(f"{v_meta.height}p")
--- a/src/delarte/www.py
+++ b/src/delarte/www.py
@ -3,28 +3,85 @@
 """Provide ArteTV website utilities."""
-from .error import InvalidUrl
+from contextlib import contextmanager
 import json
-BASE = "https://www.arte.tv/"
+from .error import InvalidPage, PageNotFound, PageNotSupported
-SITES = ["fr", "de", "en", "es", "pl", "it"]
+from .model import ProgramMeta
 _DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
-def parse_url(url):
+@contextmanager
-    """Parse ArteTV web URL into target ID and web UI language."""
+def _schema_guard(*context):
-    if not url.startswith(BASE):
+    try:
-        raise InvalidUrl("BASE", url)
+        yield
    except (KeyError, IndexError, ValueError) as e:
        raise InvalidPage("SCHEMA", *context, e)
    path = url[len(BASE) :].split("/")
-    site = path.pop(0)
+def _process_programs_page(page_value):
-    if site not in SITES:
+    with _schema_guard():
-        raise InvalidUrl("SITE", url, site)
+        site = page_value["language"]
-    if (_ := path.pop(0)) != "videos":
+        content_zones = [
-        raise InvalidUrl("PATH", url, _)
+            zone
            for zone in page_value["zones"]
            if zone["code"].startswith("program_content_")
        ]
-    id = path.pop(0)
+        programs = [
-    slug = path.pop(0)
+            ProgramMeta(
                site, data_item["programId"], data_item["title"], data_item["subtitle"]
            )
            for zone in content_zones
            for data_item in zone["content"]["data"]
            if data_item["type"] == "program"
        ]
-    return site, id, slug
+    if len(content_zones) != 1:
        raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
    if len(programs) != 1:
        raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")
    return programs[0]
 def fetch_program(http_session, url):
    """Load the ArteTV page at given URL and return list of programs on it."""
    r = http_session.get(url)
    # special handling of 404
    if r.status_code == 404:
        raise PageNotFound(url)
    # other network errors
    r.raise_for_status()
    # no HTML parsing required, whe just find the mark
    html = r.text
    start = html.find(_DATA_MARK)
    if start < 0:
        raise InvalidPage("DATA_MARK_NOT_FOUND", url)
    start += len(_DATA_MARK)
    end = html.index("</script>", start)
    try:
        next_js_data = json.loads(html[start:end].strip())
    except json.JSONDecodeError:
        raise InvalidPage("INVALID_JSON_DATA", url)
    with _schema_guard(url):
        initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
        initial_type = next_js_data["props"]["pageProps"]["initialType"]
    try:
        match initial_type:
            case "programs":
                return _process_programs_page(initial_page_value)
            case _:
                raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
    except InvalidPage as e:
        raise InvalidPage(e.args[0], url, *e.args[1:])