Refactor www module

Split functionality into smaller parts:
- fetch the html code `fetch_page_content()`
- extract JSON data from html code `extract_page_data()`
- read the program info from page data `read_page_data()`
Move that "pipeline" in `__init__.py`
This commit is contained in:
Barbagus 2023-02-16 08:18:02 +01:00
parent 23e2183c93
commit bdc8b7b246
4 changed files with 114 additions and 110 deletions

View File

@ -9,16 +9,23 @@ from .error import *
from .model import *
def fetch_program_sources(url, http):
def load_program_sources(http, page_url):
"""Fetch program sources listed on given ArteTV page."""
from .www import iter_programs
from .www import read_page_data, fetch_page_content, extract_page_data
page_content = fetch_page_content(http, page_url)
page_data = extract_page_data(page_content)
programs = read_page_data(page_data)
if not programs:
raise UnexpectedError("NO_PROGRAMS")
return [
ProgramSource(
program,
player_config_url,
Program(id, language, title, subtitle),
f"https://api.arte.tv/api/player/v2/config/{language}/{id}",
)
for program, player_config_url in iter_programs(url, http)
for id, language, title, subtitle in programs
]

View File

@ -43,7 +43,7 @@ from . import (
HTTPError,
__version__,
download_targets,
fetch_program_sources,
load_program_sources,
fetch_rendition_sources,
fetch_targets,
fetch_variant_sources,
@ -140,7 +140,7 @@ def main():
http = urllib3.PoolManager(timeout=5)
try:
program_sources = fetch_program_sources(args["URL"], http)
program_sources = load_program_sources(http, args["URL"])
rendition_sources = _select_rendition_sources(
args["RENDITION"],

View File

@ -48,6 +48,10 @@ class InvalidPage(UnexpectedError):
"""Invalid ArteTV page."""
class InvalidPageData(UnexpectedError):
    # Raised when the extracted JSON page data lacks the expected structure
    # (e.g. wrong zone count, wrong item count, mixed zones).
    """Invalid ArteTV page data."""
#
# api
#

View File

@ -5,130 +5,123 @@
import json
from .error import InvalidPage, PageNotFound, PageNotSupported, HTTPError
from .model import Program
from .error import (
HTTPError,
InvalidPage,
InvalidPageData,
PageNotFound,
PageNotSupported,
)
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
def _process_programs_page(page_value):
    """Yield (Program, player_config_url) pairs from a "program" page value.

    Expects exactly one zone whose code starts with ``program_content_``,
    containing exactly one data item of type ``program``.

    Raises:
        InvalidPage: if the zone or program counts differ from one, or a
            data item has an unexpected type.
    """
    language = page_value["language"]
    zone_found = False
    program_found = False
    for zone in page_value["zones"]:
        if zone["code"].startswith("program_content_"):
            # A second matching zone is a schema violation.
            if zone_found:
                raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
            zone_found = True
        else:
            # Irrelevant zone: skip without inspecting its content.
            continue
        for data_item in zone["content"]["data"]:
            if data_item["type"] == "program":
                # A second program item is a schema violation.
                if program_found:
                    raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_COUNT")
                program_found = True
            else:
                raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_TYPE")
            yield (
                Program(
                    data_item["programId"],
                    language,
                    data_item["title"],
                    data_item["subtitle"],
                ),
                # Player config URL is provided directly by the page data.
                data_item["player"]["config"],
            )
    # Exactly one zone and one program must have been seen overall.
    if not zone_found:
        raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
    if not program_found:
        raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_COUNT")
def _process_collections_page(page_value):
    """Yield (Program, player_config_url) pairs from a "collection" page value.

    Programs may come either from the single ``collection_videos_`` zone or
    from one or more ``collection_subcollection_`` zones — but mixing
    programs from both kinds is rejected.

    Raises:
        InvalidPage: on duplicate main zone, mixed zones, unexpected data
            item type, or when no program was found at all.
    """
    language = page_value["language"]
    main_zone_found = False
    sub_zone_found = False
    program_found = False
    for zone in page_value["zones"]:
        if zone["code"].startswith("collection_videos_"):
            # Only one main collection zone is allowed.
            if main_zone_found:
                raise InvalidPage("COLLECTIONS_MAIN_ZONE_COUNT")
            # Programs already seen must not come from another zone kind.
            if program_found:
                raise InvalidPage("COLLECTIONS_MIXED_ZONES")
            main_zone_found = True
        elif zone["code"].startswith("collection_subcollection_"):
            # Programs seen before the first sub-zone came from the main
            # zone — mixing the two kinds is a schema violation.
            if program_found and not sub_zone_found:
                raise InvalidPage("COLLECTIONS_MIXED_ZONES")
            sub_zone_found = True
        else:
            # Irrelevant zone: skip without inspecting its content.
            continue
        for data_item in zone["content"]["data"]:
            if (_ := data_item["type"]) == "teaser":
                program_found = True
            else:
                raise InvalidPage("COLLECTIONS_INVALID_CONTENT_DATA_ITEM", _)
            yield (
                Program(
                    data_item["programId"],
                    language,
                    data_item["title"],
                    data_item["subtitle"],
                ),
                # Collections carry no player config; build the URL directly.
                f"https://api.arte.tv/api/player/v2/config/{language}/{data_item['programId']}",
            )
    if not main_zone_found:
        raise InvalidPage("COLLECTIONS_MAIN_ZONE_COUNT")
    if not program_found:
        raise InvalidPage("COLLECTIONS_PROGRAMS_COUNT")
def iter_programs(page_url, http):
"""Iterate over programs listed on given ArteTV page."""
def fetch_page_content(http, page_url):
"""Fetch html content at given URL."""
r = http.request("GET", page_url)
# special handling of 404
if r.status == 404:
raise PageNotFound(page_url)
raise PageNotFound()
HTTPError.raise_for_status(r)
# no HTML parsing required, we just find the mark
html = r.data.decode("utf-8")
start = html.find(_DATA_MARK)
return r.data.decode("utf-8")
def extract_page_data(html_content):
"""Extract JSON page data from html content."""
start = html_content.find(_DATA_MARK)
if start < 0:
raise InvalidPage("DATA_MARK_NOT_FOUND", page_url)
raise InvalidPage("DATA_MARK_NOT_FOUND")
start += len(_DATA_MARK)
end = html.index("</script>", start)
end = html_content.index("</script>", start)
try:
next_js_data = json.loads(html[start:end].strip())
return json.loads(html_content[start:end].strip())
except json.JSONDecodeError:
raise InvalidPage("INVALID_JSON_DATA", page_url)
raise InvalidPage("INVALID_JSON_DATA")
def _find_zones(page_value, code_prefix):
return [
zone for zone in page_value["zones"] if zone["code"].startswith(code_prefix)
]
def _find_unique_zone(page_value, code_prefix):
    """Return the single zone matching *code_prefix*.

    Raises:
        InvalidPageData: if zero or several zones match.
    """
    matches = _find_zones(page_value, code_prefix)
    if len(matches) == 1:
        return matches[0]
    raise InvalidPageData("ZONE_COUNT")
def _find_items(zone, item_type):
return [item for item in zone["content"]["data"] if item["type"] == item_type]
def _find_unique_item(zone, item_type):
    """Return the single data item of *zone* with the given *item_type*.

    Raises:
        InvalidPageData: if zero or several items match.
    """
    candidates = _find_items(zone, item_type)
    if len(candidates) == 1:
        return candidates[0]
    raise InvalidPageData("ITEM_COUNT")
def _read_program_page(page_value):
    """Read the single program advertised on a "program" page.

    Returns:
        An ``(id, language, title, subtitle)`` tuple.

    Raises:
        InvalidPageData: if the content zone or program item is not unique.
    """
    language = page_value["language"]
    content_zone = _find_unique_zone(page_value, "program_content_")
    program_item = _find_unique_item(content_zone, "program")
    return (
        program_item["programId"],
        language,
        program_item["title"],
        program_item["subtitle"],
    )
def _read_collection_page(page_value):
    """Read the programs listed on a "collection" page.

    Teasers come either from the single ``collection_videos_`` zone or from
    ``collection_subcollection_`` zones, never from both at once.

    Returns:
        A list of ``(id, language, title, subtitle)`` tuples (possibly empty).

    Raises:
        InvalidPageData: if the main zone is not unique, or if both the main
            zone and sub-collection zones carry teasers.
    """
    language = page_value["language"]
    main_zone = _find_unique_zone(page_value, "collection_videos_")
    main_items = _find_items(main_zone, "teaser")
    sub_zones = _find_zones(page_value, "collection_subcollection_")

    if not sub_zones:
        if not main_items:
            return []
        items = main_items
    else:
        if main_items:
            # Teasers in both the main and the sub-collection zones.
            raise InvalidPageData("MIXED_ZONES")
        items = []
        for sub_zone in sub_zones:
            items.extend(_find_items(sub_zone, "teaser"))

    programs = []
    for item in items:
        programs.append(
            (item["programId"], language, item["title"], item["subtitle"])
        )
    return programs
def read_page_data(page_data):
"""Return programs listed on given JSON page data."""
try:
page_value = next_js_data["props"]["pageProps"]["props"]["page"]["value"]
page_value = page_data["props"]["pageProps"]["props"]["page"]["value"]
match page_value["type"]:
case "program":
yield from _process_programs_page(page_value)
return [_read_program_page(page_value)]
case "collection":
yield from _process_collections_page(page_value)
return _read_collection_page(page_value)
case _:
raise PageNotSupported(page_url, page_value)
raise PageNotSupported()
except (KeyError, IndexError, ValueError) as e:
raise InvalidPage("SCHEMA", page_url) from e
except InvalidPage as e:
raise InvalidPage(e.args[0], page_url) from e
raise InvalidPage("SCHEMA") from e