delarte/src/delarte/www.py

# License: GNU AGPL v3: http://www.gnu.org/licenses/
# This file is part of `delarte` (https://git.afpy.org/fcode/delarte.git)
"""Provide ArteTV website utilities."""
import json
from contextlib import contextmanager

from .error import InvalidPage, PageNotFound, PageNotSupported
from .model import ProgramMeta
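
# ArteTV pages are generated with Next.js and embed their state as JSON inside
# a <script id="__NEXT_DATA__"> tag; we locate that tag by plain string search
# instead of parsing the HTML.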
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'


@contextmanager
def _schema_guard(*context):
    """Re-raise KeyError/IndexError/ValueError as InvalidPage("SCHEMA", ...)."""
    try:
        yield
    except (KeyError, IndexError, ValueError) as e:
        raise InvalidPage("SCHEMA", *context, e)


def _process_programs_page(page_value):
    """Extract the single program presented by a "programs" page value."""
    with _schema_guard():
        site = page_value["language"]

        content_zones = [
            zone
            for zone in page_value["zones"]
            if zone["code"].startswith("program_content_")
        ]

        programs = [
            ProgramMeta(
                site, data_item["programId"], data_item["title"], data_item["subtitle"]
            )
            for zone in content_zones
            for data_item in zone["content"]["data"]
            if data_item["type"] == "program"
        ]

    if len(content_zones) != 1:
        raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")

    if len(programs) != 1:
        raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")

    return programs[0]
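
# For orientation, the lookups in _process_programs_page assume a page value
# shaped roughly like the sketch below (only the keys this module reads are
# shown; field names come from the accesses above, not from a published spec):
#
#     {"language": "fr",
#      "zones": [{"code": "program_content_...",
#                 "content": {"data": [{"type": "program",
#                                       "programId": "...",
#                                       "title": "...",
#                                       "subtitle": "..."}]}}]}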


def fetch_program(http_session, url):
    """Load the ArteTV page at the given URL and return the program it presents."""
    r = http_session.get(url)

    # special handling of 404
    if r.status_code == 404:
        raise PageNotFound(url)

    # other network errors
    r.raise_for_status()

    # no HTML parsing required, we just find the mark
    html = r.text

    start = html.find(_DATA_MARK)
    if start < 0:
        raise InvalidPage("DATA_MARK_NOT_FOUND", url)
    start += len(_DATA_MARK)

    end = html.index("</script>", start)

    try:
        next_js_data = json.loads(html[start:end].strip())
    except json.JSONDecodeError:
        raise InvalidPage("INVALID_JSON_DATA", url)
    with _schema_guard(url):
        initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
        initial_type = next_js_data["props"]["pageProps"]["initialType"]

    try:
        match initial_type:
            case "programs":
                return _process_programs_page(initial_page_value)
            case _:
                raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
    except InvalidPage as e:
        raise InvalidPage(e.args[0], url, *e.args[1:])
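

# Illustrative usage sketch: `http_session` only needs a requests-style `get()`
# returning a response with `status_code`, `text` and `raise_for_status()`, so
# a plain `requests.Session` fits. The URL below is a made-up placeholder, not
# a real program page.
#
#     import requests
#
#     from delarte.www import fetch_program
#
#     with requests.Session() as session:
#         program = fetch_program(session, "https://www.arte.tv/fr/videos/<id>/<slug>/")
#         print(program)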