Get program information from page content
Changes the way the program information is figured out: from URL parsing to page content parsing. A massive JSON object is shipped within the HTML of the page; that's where we get what we need from. Side effects: - drop `slug` from the program's info - drop `slug` naming option - no `Program` / `ProgramMeta` distinction. Includes some JSON samples.
This commit is contained in:
parent
ba2dd96b36
commit
639a8063a5
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -13,16 +13,14 @@ def fetch_sources(http_session, url):
|
|||
"""Fetch sources at a given ArteTV page URL."""
|
||||
from .api import fetch_program_info
|
||||
from .hls import fetch_program_tracks
|
||||
from .www import parse_url
|
||||
from .www import fetch_program
|
||||
|
||||
site, program_id, slug = parse_url(url)
|
||||
p_meta = fetch_program(http_session, url)
|
||||
|
||||
variants = dict()
|
||||
renditions = dict()
|
||||
|
||||
p_meta, program_index_urls = fetch_program_info(http_session, site, program_id)
|
||||
|
||||
program = Program(program_id, slug, p_meta)
|
||||
program_index_urls = fetch_program_info(http_session, p_meta)
|
||||
|
||||
for program_index_url in program_index_urls:
|
||||
v_tracks, a_track, s_track = fetch_program_tracks(
|
||||
|
@ -43,7 +41,7 @@ def fetch_sources(http_session, url):
|
|||
raise ValueError
|
||||
|
||||
return Sources(
|
||||
program,
|
||||
p_meta,
|
||||
[Variant(key, source) for key, source in variants.items()],
|
||||
[Rendition(key, source) for key, source in renditions.items()],
|
||||
)
|
||||
|
@ -146,7 +144,7 @@ def compile_sources(sources, **naming_options):
|
|||
build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)
|
||||
|
||||
return Target(
|
||||
sources.program.meta,
|
||||
sources.program,
|
||||
VideoTrack(v_meta, v_url),
|
||||
AudioTrack(a_meta, a_url),
|
||||
SubtitlesTrack(s_meta, s_url) if s_meta else None,
|
||||
|
|
|
@ -23,7 +23,6 @@ Options:
|
|||
--version print current version of the program
|
||||
--debug on error, print debugging information
|
||||
--name-use-id use the program ID
|
||||
--name-use-slug use the URL slug
|
||||
--name-sep=<sep> field separator [default: - ]
|
||||
--name-seq-pfx=<pfx> sequence counter prefix [default: - ]
|
||||
--name-seq-no-pad disable sequence zero-padding
|
||||
|
|
|
@ -27,38 +27,34 @@ def _fetch_api_data(http_session, path, object_type):
|
|||
return obj["attributes"]
|
||||
|
||||
|
||||
def fetch_program_info(http_session, site, program_id):
|
||||
def fetch_program_info(http_session, p_meta):
|
||||
"""Fetch the given program metadata and indexes."""
|
||||
obj = _fetch_api_data(http_session, f"config/{site}/{program_id}", "ConfigPlayer")
|
||||
obj = _fetch_api_data(
|
||||
http_session, f"config/{p_meta.site}/{p_meta.id}", "ConfigPlayer"
|
||||
)
|
||||
|
||||
if (_ := obj["metadata"]["providerId"]) != program_id:
|
||||
if (_ := obj["metadata"]["providerId"]) != p_meta.id:
|
||||
raise UnexpectedAPIResponse(
|
||||
"PROGRAM_ID_MISMATCH",
|
||||
site,
|
||||
program_id,
|
||||
p_meta.site,
|
||||
p_meta.id,
|
||||
_,
|
||||
)
|
||||
|
||||
program_meta = ProgramMeta(
|
||||
obj["metadata"]["title"],
|
||||
obj["metadata"]["subtitle"],
|
||||
obj["metadata"]["description"],
|
||||
)
|
||||
|
||||
program_index_urls = set()
|
||||
|
||||
for s in obj["streams"]:
|
||||
if (_ := s["protocol"]) != "HLS_NG":
|
||||
raise UnsupportedHLSProtocol(site, program_id, _)
|
||||
raise UnsupportedHLSProtocol(p_meta.site, p_meta.id, _)
|
||||
|
||||
if (program_index_url := s["url"]) in program_index_urls:
|
||||
raise UnexpectedAPIResponse(
|
||||
"DUPLICATE_PROGRAM_INDEX_URL",
|
||||
site,
|
||||
program_id,
|
||||
p_meta.site,
|
||||
p_meta.id,
|
||||
program_index_url,
|
||||
)
|
||||
|
||||
program_index_urls.add(program_index_url)
|
||||
|
||||
return program_meta, program_index_urls
|
||||
return program_index_urls
|
||||
|
|
|
@ -16,14 +16,32 @@ class ModuleError(Exception):
|
|||
return f"{self.__class__}{self.args!r}"
|
||||
|
||||
|
||||
class ExpectedError(ModuleError):
|
||||
"""A feature limitation to submit as an enhancement to developers."""
|
||||
|
||||
|
||||
class UnexpectedError(ModuleError):
|
||||
"""An error to report to developers."""
|
||||
|
||||
|
||||
class InvalidUrl(ModuleError):
|
||||
"""Invalid ArteTV URL."""
|
||||
#
|
||||
# www
|
||||
#
|
||||
class PageNotFound(ModuleError):
|
||||
"""Page not found at ArteTV."""
|
||||
|
||||
|
||||
class PageNotSupported(ExpectedError):
|
||||
"""The page you are trying to download from is not (yet) supported."""
|
||||
|
||||
|
||||
class InvalidPage(UnexpectedError):
|
||||
"""Invalid ArteTV page."""
|
||||
|
||||
|
||||
#
|
||||
# Others
|
||||
#
|
||||
class UnexpectedAPIResponse(UnexpectedError):
|
||||
"""Unexpected response from ArteTV."""
|
||||
|
||||
|
|
|
@ -10,15 +10,18 @@ from typing import NamedTuple, Optional
|
|||
class ProgramMeta(NamedTuple):
|
||||
"""A program metadata."""
|
||||
|
||||
site: str
|
||||
"""The site where it is hosted (fr, de, etc...)."""
|
||||
|
||||
id: str
|
||||
"""The ID."""
|
||||
|
||||
title: str
|
||||
"""The title."""
|
||||
|
||||
subtitle: str
|
||||
"""The subtitle or secondary title."""
|
||||
|
||||
description: str
|
||||
"""The description."""
|
||||
|
||||
|
||||
class VideoMeta(NamedTuple):
|
||||
"""A video track metadata."""
|
||||
|
@ -91,18 +94,10 @@ class Rendition(NamedTuple):
|
|||
source: tuple[str, Optional[str]]
|
||||
|
||||
|
||||
class Program(NamedTuple):
|
||||
"""A program representation."""
|
||||
|
||||
id: str
|
||||
slug: str
|
||||
meta: ProgramMeta
|
||||
|
||||
|
||||
class Sources(NamedTuple):
|
||||
"""A program's sources."""
|
||||
|
||||
program: Program
|
||||
program: ProgramMeta
|
||||
variants: list[Variant]
|
||||
renditions: list[Rendition]
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
import re
|
||||
|
||||
from typing import Optional
|
||||
from .model import Program, VideoMeta, AudioMeta, SubtitlesMeta
|
||||
from .model import VideoMeta, AudioMeta, SubtitlesMeta
|
||||
|
||||
|
||||
def file_name_builder(
|
||||
|
@ -14,7 +14,6 @@ def file_name_builder(
|
|||
s_meta: Optional[SubtitlesMeta],
|
||||
*,
|
||||
use_id=False,
|
||||
use_slug=False,
|
||||
sep=" - ",
|
||||
seq_pfx=" - ",
|
||||
seq_no_pad=False,
|
||||
|
@ -32,17 +31,14 @@ def file_name_builder(
|
|||
def replace_sequence_counter(s: str) -> str:
|
||||
return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)
|
||||
|
||||
def build_file_name(program: Program) -> str:
|
||||
def build_file_name(p_meta) -> str:
|
||||
"""Create a file name for given program."""
|
||||
if use_id:
|
||||
return program.id
|
||||
return p_meta.id
|
||||
|
||||
if use_slug:
|
||||
return program.slug
|
||||
|
||||
fields = [replace_sequence_counter(program.meta.title)]
|
||||
if program.meta.subtitle:
|
||||
fields.add(replace_sequence_counter(program.meta.subtitles))
|
||||
fields = [replace_sequence_counter(p_meta.title)]
|
||||
if p_meta.subtitle:
|
||||
fields.append(replace_sequence_counter(p_meta.subtitle))
|
||||
|
||||
if add_resolution:
|
||||
fields.append(f"{v_meta.height}p")
|
||||
|
|
|
@ -3,28 +3,85 @@
|
|||
|
||||
"""Provide ArteTV website utilities."""
|
||||
|
||||
from .error import InvalidUrl
|
||||
from contextlib import contextmanager
|
||||
import json
|
||||
|
||||
BASE = "https://www.arte.tv/"
|
||||
SITES = ["fr", "de", "en", "es", "pl", "it"]
|
||||
from .error import InvalidPage, PageNotFound, PageNotSupported
|
||||
from .model import ProgramMeta
|
||||
|
||||
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
|
||||
|
||||
|
||||
def parse_url(url):
|
||||
"""Parse ArteTV web URL into target ID and web UI language."""
|
||||
if not url.startswith(BASE):
|
||||
raise InvalidUrl("BASE", url)
|
||||
@contextmanager
|
||||
def _schema_guard(*context):
|
||||
try:
|
||||
yield
|
||||
except (KeyError, IndexError, ValueError) as e:
|
||||
raise InvalidPage("SCHEMA", *context, e)
|
||||
|
||||
path = url[len(BASE) :].split("/")
|
||||
|
||||
site = path.pop(0)
|
||||
def _process_programs_page(page_value):
|
||||
|
||||
if site not in SITES:
|
||||
raise InvalidUrl("SITE", url, site)
|
||||
with _schema_guard():
|
||||
site = page_value["language"]
|
||||
|
||||
if (_ := path.pop(0)) != "videos":
|
||||
raise InvalidUrl("PATH", url, _)
|
||||
content_zones = [
|
||||
zone
|
||||
for zone in page_value["zones"]
|
||||
if zone["code"].startswith("program_content_")
|
||||
]
|
||||
|
||||
id = path.pop(0)
|
||||
slug = path.pop(0)
|
||||
programs = [
|
||||
ProgramMeta(
|
||||
site, data_item["programId"], data_item["title"], data_item["subtitle"]
|
||||
)
|
||||
for zone in content_zones
|
||||
for data_item in zone["content"]["data"]
|
||||
if data_item["type"] == "program"
|
||||
]
|
||||
|
||||
return site, id, slug
|
||||
if len(content_zones) != 1:
|
||||
raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
|
||||
|
||||
if len(programs) != 1:
|
||||
raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")
|
||||
|
||||
return programs[0]
|
||||
|
||||
|
||||
def fetch_program(http_session, url):
|
||||
"""Load the ArteTV page at given URL and return list of programs on it."""
|
||||
r = http_session.get(url)
|
||||
|
||||
# special handling of 404
|
||||
if r.status_code == 404:
|
||||
raise PageNotFound(url)
|
||||
|
||||
# other network errors
|
||||
r.raise_for_status()
|
||||
|
||||
# no HTML parsing required, whe just find the mark
|
||||
html = r.text
|
||||
start = html.find(_DATA_MARK)
|
||||
if start < 0:
|
||||
raise InvalidPage("DATA_MARK_NOT_FOUND", url)
|
||||
start += len(_DATA_MARK)
|
||||
end = html.index("</script>", start)
|
||||
|
||||
try:
|
||||
next_js_data = json.loads(html[start:end].strip())
|
||||
except json.JSONDecodeError:
|
||||
raise InvalidPage("INVALID_JSON_DATA", url)
|
||||
|
||||
with _schema_guard(url):
|
||||
initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
|
||||
initial_type = next_js_data["props"]["pageProps"]["initialType"]
|
||||
|
||||
try:
|
||||
match initial_type:
|
||||
case "programs":
|
||||
return _process_programs_page(initial_page_value)
|
||||
case _:
|
||||
raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
|
||||
except InvalidPage as e:
|
||||
raise InvalidPage(e.args[0], url, *e.args[1:])
|
||||
|
|
Loading…
Reference in New Issue