Get program information from page content

Changes the way the program information is figured out: from URL parsing to page content parsing. A massive JSON object is shipped within the HTML of the page; that's where we get what we need from. Side effects: - drop `slug` from the program's info - drop `slug` naming option - no `Program` / `ProgramMeta` distinction. Includes some JSON samples.
2023-01-14 19:51:02 +01:00 · 2023-01-14 19:51:02 +01:00 · 639a8063a5
parent ba2dd96b36
commit 639a8063a5
9 changed files with 6532 additions and 63 deletions
--- a/samples/www/103011-000-A__l-assaut-du-capitole.json
+++ b/samples/www/103011-000-A__l-assaut-du-capitole.json
--- a/samples/www/109041-002-A__acquitted-saison-1-2-10.json
+++ b/samples/www/109041-002-A__acquitted-saison-1-2-10.json
--- a/src/delarte/init.py
+++ b/src/delarte/init.py
@ -13,16 +13,14 @@ def fetch_sources(http_session, url):
    """Fetch sources at a given ArteTV page URL."""
    from .api import fetch_program_info
    from .hls import fetch_program_tracks
-    from .www import parse_url
+    from .www import fetch_program

-    site, program_id, slug = parse_url(url)
+    p_meta = fetch_program(http_session, url)

    variants = dict()
    renditions = dict()

-    p_meta, program_index_urls = fetch_program_info(http_session, site, program_id)
-
-    program = Program(program_id, slug, p_meta)
+    program_index_urls = fetch_program_info(http_session, p_meta)

    for program_index_url in program_index_urls:
        v_tracks, a_track, s_track = fetch_program_tracks(
@ -43,7 +41,7 @@ def fetch_sources(http_session, url):
            raise ValueError

    return Sources(
-        program,
+        p_meta,
        [Variant(key, source) for key, source in variants.items()],
        [Rendition(key, source) for key, source in renditions.items()],
    )
@ -146,7 +144,7 @@ def compile_sources(sources, **naming_options):
    build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)

    return Target(
-        sources.program.meta,
+        sources.program,
        VideoTrack(v_meta, v_url),
        AudioTrack(a_meta, a_url),
        SubtitlesTrack(s_meta, s_url) if s_meta else None,
--- a/src/delarte/main.py
+++ b/src/delarte/main.py
@ -23,7 +23,6 @@ Options:
  --version              print current version of the program
  --debug                on error, print debugging information
  --name-use-id          use the program ID
-  --name-use-slug        use the URL slug
  --name-sep=<sep>       field separator [default:  - ]
  --name-seq-pfx=<pfx>   sequence counter prefix [default:  - ]
  --name-seq-no-pad      disable sequence zero-padding
--- a/src/delarte/api.py
+++ b/src/delarte/api.py
@ -27,38 +27,34 @@ def _fetch_api_data(http_session, path, object_type):
    return obj["attributes"]


-def fetch_program_info(http_session, site, program_id):
+def fetch_program_info(http_session, p_meta):
    """Fetch the given program metadata and indexes."""
-    obj = _fetch_api_data(http_session, f"config/{site}/{program_id}", "ConfigPlayer")
+    obj = _fetch_api_data(
+        http_session, f"config/{p_meta.site}/{p_meta.id}", "ConfigPlayer"
+    )

-    if (_ := obj["metadata"]["providerId"]) != program_id:
+    if (_ := obj["metadata"]["providerId"]) != p_meta.id:
        raise UnexpectedAPIResponse(
            "PROGRAM_ID_MISMATCH",
-            site,
-            program_id,
+            p_meta.site,
+            p_meta.id,
            _,
        )

-    program_meta = ProgramMeta(
-        obj["metadata"]["title"],
-        obj["metadata"]["subtitle"],
-        obj["metadata"]["description"],
-    )
-
    program_index_urls = set()

    for s in obj["streams"]:
        if (_ := s["protocol"]) != "HLS_NG":
-            raise UnsupportedHLSProtocol(site, program_id, _)
+            raise UnsupportedHLSProtocol(p_meta.site, p_meta.id, _)

        if (program_index_url := s["url"]) in program_index_urls:
            raise UnexpectedAPIResponse(
                "DUPLICATE_PROGRAM_INDEX_URL",
-                site,
-                program_id,
+                p_meta.site,
+                p_meta.id,
                program_index_url,
            )

        program_index_urls.add(program_index_url)

-    return program_meta, program_index_urls
+    return program_index_urls
--- a/src/delarte/error.py
+++ b/src/delarte/error.py
@ -16,14 +16,32 @@ class ModuleError(Exception):
        return f"{self.__class__}{self.args!r}"


+class ExpectedError(ModuleError):
+    """A feature limitation to submit as an enhancement to developers."""
+
+
 class UnexpectedError(ModuleError):
    """An error to report to developers."""


-class InvalidUrl(ModuleError):
-    """Invalid ArteTV URL."""
+#
+# www
+#
+class PageNotFound(ModuleError):
+    """Page not found at ArteTV."""


+class PageNotSupported(ExpectedError):
+    """The page you are trying to download from is not (yet) supported."""
+
+
+class InvalidPage(UnexpectedError):
+    """Invalid ArteTV page."""
+
+
+#
+# Others
+#
 class UnexpectedAPIResponse(UnexpectedError):
    """Unexpected response from ArteTV."""

--- a/src/delarte/model.py
+++ b/src/delarte/model.py
@ -10,15 +10,18 @@ from typing import NamedTuple, Optional
 class ProgramMeta(NamedTuple):
    """A program metadata."""

+    site: str
+    """The site where it is hosted (fr, de, etc...)."""
+
+    id: str
+    """The ID."""
+
    title: str
    """The title."""

    subtitle: str
    """The subtitle or secondary title."""

-    description: str
-    """The description."""
-

 class VideoMeta(NamedTuple):
    """A video track metadata."""
@ -91,18 +94,10 @@ class Rendition(NamedTuple):
    source: tuple[str, Optional[str]]


-class Program(NamedTuple):
-    """A program representation."""
-
-    id: str
-    slug: str
-    meta: ProgramMeta
-
-
 class Sources(NamedTuple):
    """A program's sources."""

-    program: Program
+    program: ProgramMeta
    variants: list[Variant]
    renditions: list[Rendition]

--- a/src/delarte/naming.py
+++ b/src/delarte/naming.py
@ -5,7 +5,7 @@
 import re

 from typing import Optional
-from .model import Program, VideoMeta, AudioMeta, SubtitlesMeta
+from .model import VideoMeta, AudioMeta, SubtitlesMeta


 def file_name_builder(
@ -14,7 +14,6 @@ def file_name_builder(
    s_meta: Optional[SubtitlesMeta],
    *,
    use_id=False,
-    use_slug=False,
    sep=" - ",
    seq_pfx=" - ",
    seq_no_pad=False,
@ -32,17 +31,14 @@ def file_name_builder(
    def replace_sequence_counter(s: str) -> str:
        return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)

-    def build_file_name(program: Program) -> str:
+    def build_file_name(p_meta) -> str:
        """Create a file name for given program."""
        if use_id:
-            return program.id
+            return p_meta.id

-        if use_slug:
-            return program.slug
-
-        fields = [replace_sequence_counter(program.meta.title)]
-        if program.meta.subtitle:
-            fields.add(replace_sequence_counter(program.meta.subtitles))
+        fields = [replace_sequence_counter(p_meta.title)]
+        if p_meta.subtitle:
+            fields.append(replace_sequence_counter(p_meta.subtitle))

        if add_resolution:
            fields.append(f"{v_meta.height}p")
--- a/src/delarte/www.py
+++ b/src/delarte/www.py
@ -3,28 +3,85 @@

 """Provide ArteTV website utilities."""

-from .error import InvalidUrl
+from contextlib import contextmanager
+import json

-BASE = "https://www.arte.tv/"
-SITES = ["fr", "de", "en", "es", "pl", "it"]
+from .error import InvalidPage, PageNotFound, PageNotSupported
+from .model import ProgramMeta
+
+_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'


-def parse_url(url):
-    """Parse ArteTV web URL into target ID and web UI language."""
-    if not url.startswith(BASE):
-        raise InvalidUrl("BASE", url)
+@contextmanager
+def _schema_guard(*context):
+    try:
+        yield
+    except (KeyError, IndexError, ValueError) as e:
+        raise InvalidPage("SCHEMA", *context, e)

-    path = url[len(BASE) :].split("/")

-    site = path.pop(0)
+def _process_programs_page(page_value):

-    if site not in SITES:
-        raise InvalidUrl("SITE", url, site)
+    with _schema_guard():
+        site = page_value["language"]

-    if (_ := path.pop(0)) != "videos":
-        raise InvalidUrl("PATH", url, _)
+        content_zones = [
+            zone
+            for zone in page_value["zones"]
+            if zone["code"].startswith("program_content_")
+        ]

-    id = path.pop(0)
-    slug = path.pop(0)
+        programs = [
+            ProgramMeta(
+                site, data_item["programId"], data_item["title"], data_item["subtitle"]
+            )
+            for zone in content_zones
+            for data_item in zone["content"]["data"]
+            if data_item["type"] == "program"
+        ]

-    return site, id, slug
+    if len(content_zones) != 1:
+        raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
+
+    if len(programs) != 1:
+        raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")
+
+    return programs[0]
+
+
+def fetch_program(http_session, url):
+    """Load the ArteTV page at given URL and return list of programs on it."""
+    r = http_session.get(url)
+
+    # special handling of 404
+    if r.status_code == 404:
+        raise PageNotFound(url)
+
+    # other network errors
+    r.raise_for_status()
+
+    # no HTML parsing required, we just find the mark
+    html = r.text
+    start = html.find(_DATA_MARK)
+    if start < 0:
+        raise InvalidPage("DATA_MARK_NOT_FOUND", url)
+    start += len(_DATA_MARK)
+    end = html.index("</script>", start)
+
+    try:
+        next_js_data = json.loads(html[start:end].strip())
+    except json.JSONDecodeError:
+        raise InvalidPage("INVALID_JSON_DATA", url)
+
+    with _schema_guard(url):
+        initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
+        initial_type = next_js_data["props"]["pageProps"]["initialType"]
+
+    try:
+        match initial_type:
+            case "programs":
+                return _process_programs_page(initial_page_value)
+            case _:
+                raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
+    except InvalidPage as e:
+        raise InvalidPage(e.args[0], url, *e.args[1:])