delarte/src/delarte/www.py

# License: GNU AGPL v3: http://www.gnu.org/licenses/
# This file is part of `delarte` (https://git.afpy.org/fcode/delarte.git)
"""Provide ArteTV website utilities."""
import json
from contextlib import contextmanager

from .error import InvalidPage, PageNotFound, PageNotSupported
from .model import ProgramMeta
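
# ArteTV pages are generated with Next.js and embed their state as JSON inside
# a <script id="__NEXT_DATA__"> tag; we locate that tag by plain string search
# instead of parsing the HTML.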
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'


@contextmanager
def _schema_guard(*context):
    """Re-raise KeyError/IndexError/ValueError as InvalidPage("SCHEMA", ...)."""
    try:
        yield
    except (KeyError, IndexError, ValueError) as e:
        raise InvalidPage("SCHEMA", *context, e)


def _process_programs_page(page_value):
    """Extract the single program presented by a "programs" page value."""
    with _schema_guard():
        site = page_value["language"]

        content_zones = [
            zone
            for zone in page_value["zones"]
            if zone["code"].startswith("program_content_")
        ]

        programs = [
            ProgramMeta(
                site, data_item["programId"], data_item["title"], data_item["subtitle"]
            )
            for zone in content_zones
            for data_item in zone["content"]["data"]
            if data_item["type"] == "program"
        ]

    if len(content_zones) != 1:
        raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")

    if len(programs) != 1:
        raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")

    return programs[0]
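
# For orientation, the lookups in _process_programs_page assume a page value
# shaped roughly like the sketch below (only the keys this module reads are
# shown; field names come from the accesses above, not from a published spec):
#
#     {"language": "fr",
#      "zones": [{"code": "program_content_...",
#                 "content": {"data": [{"type": "program",
#                                       "programId": "...",
#                                       "title": "...",
#                                       "subtitle": "..."}]}}]}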


def fetch_program(http_session, url):
    """Load the ArteTV page at the given URL and return the program it presents."""
    r = http_session.get(url)

    # special handling of 404
    if r.status_code == 404:
        raise PageNotFound(url)

    # other network errors
    r.raise_for_status()

    # no HTML parsing required, we just find the mark
    html = r.text

    start = html.find(_DATA_MARK)
    if start < 0:
        raise InvalidPage("DATA_MARK_NOT_FOUND", url)
    start += len(_DATA_MARK)

    end = html.index("</script>", start)

    try:
        next_js_data = json.loads(html[start:end].strip())
    except json.JSONDecodeError:
        raise InvalidPage("INVALID_JSON_DATA", url)
    with _schema_guard(url):
        initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
        initial_type = next_js_data["props"]["pageProps"]["initialType"]

    try:
        match initial_type:
            case "programs":
                return _process_programs_page(initial_page_value)
            case _:
                raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
    except InvalidPage as e:
        raise InvalidPage(e.args[0], url, *e.args[1:])
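

# Illustrative usage sketch: `http_session` only needs a requests-style `get()`
# returning a response with `status_code`, `text` and `raise_for_status()`, so
# a plain `requests.Session` fits. The URL below is a made-up placeholder, not
# a real program page.
#
#     import requests
#
#     from delarte.www import fetch_program
#
#     with requests.Session() as session:
#         program = fetch_program(session, "https://www.arte.tv/fr/videos/<id>/<slug>/")
#         print(program)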