delarte/src/delarte/www.py

135 lines
4.1 KiB
Python

# License: GNU AGPL v3: http://www.gnu.org/licenses/
# This file is part of `delarte` (https://git.afpy.org/fcode/delarte.git)
"""Provide ArteTV website utilities."""
import json
from .error import InvalidPage, PageNotFound, PageNotSupported
from .model import Program
# Opening tag of the inline script element; `iter_programs` decodes the text
# between this mark and the next "</script>" as JSON.
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
def _process_programs_page(page_value):
    """Yield the single program described by a "program" page value.

    Only zones whose code starts with "program_content_" are inspected; every
    other zone is skipped.

    Yields:
        `(Program, player_config)` tuples, where `player_config` is the
        item's `["player"]["config"]` entry.

    Raises:
        InvalidPage: if there is not exactly one matching zone, not exactly
            one program item, or an item whose type is not "program".
    """
    lang = page_value["language"]
    seen_zone = False
    seen_program = False

    for zone in page_value["zones"]:
        # ignore everything but the program content zone
        if not zone["code"].startswith("program_content_"):
            continue
        if seen_zone:
            raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
        seen_zone = True

        for item in zone["content"]["data"]:
            if item["type"] != "program":
                raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_TYPE")
            if seen_program:
                raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_COUNT")
            seen_program = True

            program = Program(
                item["programId"],
                lang,
                item["title"],
                item["subtitle"],
            )
            yield program, item["player"]["config"]

    # these checks only run once the caller has consumed the whole generator
    if not seen_zone:
        raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
    if not seen_program:
        raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_COUNT")
def _process_collections_page(page_value):
    """Yield every program listed by a "collection" page value.

    Two kinds of zones are inspected, told apart by the prefix of their code:
    the main zone ("collection_videos_") and sub-collection zones
    ("collection_subcollection_"). Any other zone is skipped.

    Yields:
        `(Program, config_url)` tuples, where `config_url` is built from the
        page language and the item's program id.

    Raises:
        InvalidPage: if the main zone is missing or duplicated, if programs
            are spread over both kinds of zones, if an item is not a
            "teaser", or if no program was found at all.
    """
    lang = page_value["language"]
    seen_main_zone = False
    seen_sub_zone = False
    seen_program = False

    for zone in page_value["zones"]:
        zone_code = zone["code"]
        is_main_zone = zone_code.startswith("collection_videos_")
        if not is_main_zone and not zone_code.startswith(
            "collection_subcollection_"
        ):
            continue

        if is_main_zone:
            if seen_main_zone:
                raise InvalidPage("COLLECTIONS_MAIN_ZONE_COUNT")
            if seen_program:
                # programs were already found in an earlier zone
                raise InvalidPage("COLLECTIONS_MIXED_ZONES")
            seen_main_zone = True
        else:
            if seen_program and not seen_sub_zone:
                # programs were found before the first sub-collection zone
                raise InvalidPage("COLLECTIONS_MIXED_ZONES")
            seen_sub_zone = True

        for item in zone["content"]["data"]:
            item_type = item["type"]
            if item_type != "teaser":
                raise InvalidPage(
                    "COLLECTIONS_INVALID_CONTENT_DATA_ITEM", item_type
                )
            seen_program = True

            program_id = item["programId"]
            program = Program(
                program_id,
                lang,
                item["title"],
                item["subtitle"],
            )
            config_url = (
                f"https://api.arte.tv/api/player/v2/config/{lang}/{program_id}"
            )
            yield program, config_url

    # these checks only run once the caller has consumed the whole generator
    if not seen_main_zone:
        raise InvalidPage("COLLECTIONS_MAIN_ZONE_COUNT")
    if not seen_program:
        raise InvalidPage("COLLECTIONS_PROGRAMS_COUNT")
def iter_programs(page_url, http_session):
"""Iterate over programs listed on given ArteTV page."""
r = http_session.get(page_url)
# special handling of 404
if r.status_code == 404:
raise PageNotFound(page_url)
r.raise_for_status()
# no HTML parsing required, whe just find the mark
html = r.text
start = html.find(_DATA_MARK)
if start < 0:
raise InvalidPage("DATA_MARK_NOT_FOUND", page_url)
start += len(_DATA_MARK)
end = html.index("</script>", start)
try:
next_js_data = json.loads(html[start:end].strip())
except json.JSONDecodeError:
raise InvalidPage("INVALID_JSON_DATA", page_url)
try:
page_value = next_js_data["props"]["pageProps"]["props"]["page"]["value"]
match page_value["type"]:
case "program":
yield from _process_programs_page(page_value)
case "collection":
yield from _process_collections_page(page_value)
case _:
raise PageNotSupported(page_url, page_value)
except (KeyError, IndexError, ValueError) as e:
raise InvalidPage("SCHEMA", page_url) from e
except InvalidPage as e:
raise InvalidPage(e.args[0], page_url) from e