Refactor www module
Split the functionality into smaller parts: fetch the HTML code (`fetch_page_content()`), extract the JSON data from the HTML code (`extract_page_data()`), and read the program info from the page data (`read_page_data()`). Move that "pipeline" into `__init__.py`.
This commit is contained in:
parent
23e2183c93
commit
bdc8b7b246
|
@ -9,16 +9,23 @@ from .error import *
|
|||
from .model import *
|
||||
|
||||
|
||||
def fetch_program_sources(url, http):
|
||||
def load_program_sources(http, page_url):
|
||||
"""Fetch program sources listed on given ArteTV page."""
|
||||
from .www import iter_programs
|
||||
from .www import read_page_data, fetch_page_content, extract_page_data
|
||||
|
||||
page_content = fetch_page_content(http, page_url)
|
||||
page_data = extract_page_data(page_content)
|
||||
programs = read_page_data(page_data)
|
||||
|
||||
if not programs:
|
||||
raise UnexpectedError("NO_PROGRAMS")
|
||||
|
||||
return [
|
||||
ProgramSource(
|
||||
program,
|
||||
player_config_url,
|
||||
Program(id, language, title, subtitle),
|
||||
f"https://api.arte.tv/api/player/v2/config/{language}/{id}",
|
||||
)
|
||||
for program, player_config_url in iter_programs(url, http)
|
||||
for id, language, title, subtitle in programs
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ from . import (
|
|||
HTTPError,
|
||||
__version__,
|
||||
download_targets,
|
||||
fetch_program_sources,
|
||||
load_program_sources,
|
||||
fetch_rendition_sources,
|
||||
fetch_targets,
|
||||
fetch_variant_sources,
|
||||
|
@ -140,7 +140,7 @@ def main():
|
|||
http = urllib3.PoolManager(timeout=5)
|
||||
|
||||
try:
|
||||
program_sources = fetch_program_sources(args["URL"], http)
|
||||
program_sources = load_program_sources(http, args["URL"])
|
||||
|
||||
rendition_sources = _select_rendition_sources(
|
||||
args["RENDITION"],
|
||||
|
|
|
@ -48,6 +48,10 @@ class InvalidPage(UnexpectedError):
|
|||
"""Invalid ArteTV page."""
|
||||
|
||||
|
||||
class InvalidPageData(UnexpectedError):
|
||||
"""Invalid ArteTV page data."""
|
||||
|
||||
|
||||
#
|
||||
# api
|
||||
#
|
||||
|
|
|
@ -5,130 +5,123 @@
|
|||
|
||||
import json
|
||||
|
||||
from .error import InvalidPage, PageNotFound, PageNotSupported, HTTPError
|
||||
from .model import Program
|
||||
from .error import (
|
||||
HTTPError,
|
||||
InvalidPage,
|
||||
InvalidPageData,
|
||||
PageNotFound,
|
||||
PageNotSupported,
|
||||
)
|
||||
|
||||
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
|
||||
|
||||
|
||||
def _process_programs_page(page_value):
|
||||
language = page_value["language"]
|
||||
|
||||
zone_found = False
|
||||
program_found = False
|
||||
|
||||
for zone in page_value["zones"]:
|
||||
if zone["code"].startswith("program_content_"):
|
||||
if zone_found:
|
||||
raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
|
||||
zone_found = True
|
||||
else:
|
||||
continue
|
||||
|
||||
for data_item in zone["content"]["data"]:
|
||||
if data_item["type"] == "program":
|
||||
if program_found:
|
||||
raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_COUNT")
|
||||
program_found = True
|
||||
else:
|
||||
raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_TYPE")
|
||||
|
||||
yield (
|
||||
Program(
|
||||
data_item["programId"],
|
||||
language,
|
||||
data_item["title"],
|
||||
data_item["subtitle"],
|
||||
),
|
||||
data_item["player"]["config"],
|
||||
)
|
||||
|
||||
if not zone_found:
|
||||
raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
|
||||
|
||||
if not program_found:
|
||||
raise InvalidPage("PROGRAMS_CONTENT_PROGRAM_COUNT")
|
||||
|
||||
|
||||
def _process_collections_page(page_value):
|
||||
language = page_value["language"]
|
||||
|
||||
main_zone_found = False
|
||||
sub_zone_found = False
|
||||
program_found = False
|
||||
|
||||
for zone in page_value["zones"]:
|
||||
if zone["code"].startswith("collection_videos_"):
|
||||
if main_zone_found:
|
||||
raise InvalidPage("COLLECTIONS_MAIN_ZONE_COUNT")
|
||||
if program_found:
|
||||
raise InvalidPage("COLLECTIONS_MIXED_ZONES")
|
||||
main_zone_found = True
|
||||
elif zone["code"].startswith("collection_subcollection_"):
|
||||
if program_found and not sub_zone_found:
|
||||
raise InvalidPage("COLLECTIONS_MIXED_ZONES")
|
||||
sub_zone_found = True
|
||||
else:
|
||||
continue
|
||||
|
||||
for data_item in zone["content"]["data"]:
|
||||
if (_ := data_item["type"]) == "teaser":
|
||||
program_found = True
|
||||
else:
|
||||
raise InvalidPage("COLLECTIONS_INVALID_CONTENT_DATA_ITEM", _)
|
||||
|
||||
yield (
|
||||
Program(
|
||||
data_item["programId"],
|
||||
language,
|
||||
data_item["title"],
|
||||
data_item["subtitle"],
|
||||
),
|
||||
f"https://api.arte.tv/api/player/v2/config/{language}/{data_item['programId']}",
|
||||
)
|
||||
|
||||
if not main_zone_found:
|
||||
raise InvalidPage("COLLECTIONS_MAIN_ZONE_COUNT")
|
||||
|
||||
if not program_found:
|
||||
raise InvalidPage("COLLECTIONS_PROGRAMS_COUNT")
|
||||
|
||||
|
||||
def iter_programs(page_url, http):
|
||||
"""Iterate over programs listed on given ArteTV page."""
|
||||
def fetch_page_content(http, page_url):
|
||||
"""Fetch html content at given URL."""
|
||||
r = http.request("GET", page_url)
|
||||
|
||||
# special handling of 404
|
||||
if r.status == 404:
|
||||
raise PageNotFound(page_url)
|
||||
raise PageNotFound()
|
||||
HTTPError.raise_for_status(r)
|
||||
|
||||
# no HTML parsing required, we just find the mark
|
||||
html = r.data.decode("utf-8")
|
||||
start = html.find(_DATA_MARK)
|
||||
return r.data.decode("utf-8")
|
||||
|
||||
|
||||
def extract_page_data(html_content):
|
||||
"""Extract JSON page data from html content."""
|
||||
start = html_content.find(_DATA_MARK)
|
||||
if start < 0:
|
||||
raise InvalidPage("DATA_MARK_NOT_FOUND", page_url)
|
||||
raise InvalidPage("DATA_MARK_NOT_FOUND")
|
||||
start += len(_DATA_MARK)
|
||||
end = html.index("</script>", start)
|
||||
end = html_content.index("</script>", start)
|
||||
|
||||
try:
|
||||
next_js_data = json.loads(html[start:end].strip())
|
||||
return json.loads(html_content[start:end].strip())
|
||||
except json.JSONDecodeError:
|
||||
raise InvalidPage("INVALID_JSON_DATA", page_url)
|
||||
raise InvalidPage("INVALID_JSON_DATA")
|
||||
|
||||
|
||||
def _find_zones(page_value, code_prefix):
|
||||
return [
|
||||
zone for zone in page_value["zones"] if zone["code"].startswith(code_prefix)
|
||||
]
|
||||
|
||||
|
||||
def _find_unique_zone(page_value, code_prefix):
|
||||
zones = _find_zones(page_value, code_prefix)
|
||||
if len(zones) != 1:
|
||||
raise InvalidPageData("ZONE_COUNT")
|
||||
return zones[0]
|
||||
|
||||
|
||||
def _find_items(zone, item_type):
|
||||
return [item for item in zone["content"]["data"] if item["type"] == item_type]
|
||||
|
||||
|
||||
def _find_unique_item(zone, item_type):
|
||||
items = _find_items(zone, item_type)
|
||||
if len(items) != 1:
|
||||
raise InvalidPageData("ITEM_COUNT")
|
||||
return items[0]
|
||||
|
||||
|
||||
def _read_program_page(page_value):
|
||||
language = page_value["language"]
|
||||
|
||||
zone = _find_unique_zone(page_value, "program_content_")
|
||||
item = _find_unique_item(zone, "program")
|
||||
return (
|
||||
item["programId"],
|
||||
language,
|
||||
item["title"],
|
||||
item["subtitle"],
|
||||
)
|
||||
|
||||
|
||||
def _read_collection_page(page_value):
|
||||
language = page_value["language"]
|
||||
|
||||
main_zone = _find_unique_zone(page_value, "collection_videos_")
|
||||
main_items = _find_items(main_zone, "teaser")
|
||||
|
||||
sub_zones = _find_zones(page_value, "collection_subcollection_")
|
||||
|
||||
if sub_zones:
|
||||
if main_items:
|
||||
raise InvalidPageData("MIXED_ZONES")
|
||||
items = [
|
||||
item for sub_zone in sub_zones for item in _find_items(sub_zone, "teaser")
|
||||
]
|
||||
elif main_items:
|
||||
items = main_items
|
||||
else:
|
||||
return []
|
||||
|
||||
return [
|
||||
(
|
||||
item["programId"],
|
||||
language,
|
||||
item["title"],
|
||||
item["subtitle"],
|
||||
)
|
||||
for item in items
|
||||
]
|
||||
|
||||
|
||||
def read_page_data(page_data):
|
||||
"""Return programs listed on given JSON page data."""
|
||||
|
||||
try:
|
||||
page_value = next_js_data["props"]["pageProps"]["props"]["page"]["value"]
|
||||
page_value = page_data["props"]["pageProps"]["props"]["page"]["value"]
|
||||
|
||||
match page_value["type"]:
|
||||
case "program":
|
||||
yield from _process_programs_page(page_value)
|
||||
return [_read_program_page(page_value)]
|
||||
case "collection":
|
||||
yield from _process_collections_page(page_value)
|
||||
return _read_collection_page(page_value)
|
||||
case _:
|
||||
raise PageNotSupported(page_url, page_value)
|
||||
raise PageNotSupported()
|
||||
|
||||
except (KeyError, IndexError, ValueError) as e:
|
||||
raise InvalidPage("SCHEMA", page_url) from e
|
||||
|
||||
except InvalidPage as e:
|
||||
raise InvalidPage(e.args[0], page_url) from e
|
||||
raise InvalidPage("SCHEMA") from e
|
||||
|
|
Loading…
Reference in New Issue
Block a user