Get program information from page content
Changes the way the program information is figured out: from URL parsing to page content parsing. A massive JSON object is shipped within the HTML of the page; that's where we get what we need from. Side effects: - drop `slug` from the program's info - drop `slug` naming option - no `Program` / `ProgramMeta` distinction. Includes some JSON samples.
This commit is contained in:
parent
ba2dd96b36
commit
639a8063a5
3407
samples/www/103011-000-A__l-assaut-du-capitole.json
Normal file
3407
samples/www/103011-000-A__l-assaut-du-capitole.json
Normal file
File diff suppressed because it is too large
Load Diff
3003
samples/www/109041-002-A__acquitted-saison-1-2-10.json
Normal file
3003
samples/www/109041-002-A__acquitted-saison-1-2-10.json
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -13,16 +13,14 @@ def fetch_sources(http_session, url):
|
||||||
"""Fetch sources at a given ArteTV page URL."""
|
"""Fetch sources at a given ArteTV page URL."""
|
||||||
from .api import fetch_program_info
|
from .api import fetch_program_info
|
||||||
from .hls import fetch_program_tracks
|
from .hls import fetch_program_tracks
|
||||||
from .www import parse_url
|
from .www import fetch_program
|
||||||
|
|
||||||
site, program_id, slug = parse_url(url)
|
p_meta = fetch_program(http_session, url)
|
||||||
|
|
||||||
variants = dict()
|
variants = dict()
|
||||||
renditions = dict()
|
renditions = dict()
|
||||||
|
|
||||||
p_meta, program_index_urls = fetch_program_info(http_session, site, program_id)
|
program_index_urls = fetch_program_info(http_session, p_meta)
|
||||||
|
|
||||||
program = Program(program_id, slug, p_meta)
|
|
||||||
|
|
||||||
for program_index_url in program_index_urls:
|
for program_index_url in program_index_urls:
|
||||||
v_tracks, a_track, s_track = fetch_program_tracks(
|
v_tracks, a_track, s_track = fetch_program_tracks(
|
||||||
|
@ -43,7 +41,7 @@ def fetch_sources(http_session, url):
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
return Sources(
|
return Sources(
|
||||||
program,
|
p_meta,
|
||||||
[Variant(key, source) for key, source in variants.items()],
|
[Variant(key, source) for key, source in variants.items()],
|
||||||
[Rendition(key, source) for key, source in renditions.items()],
|
[Rendition(key, source) for key, source in renditions.items()],
|
||||||
)
|
)
|
||||||
|
@ -146,7 +144,7 @@ def compile_sources(sources, **naming_options):
|
||||||
build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)
|
build_file_name = file_name_builder(v_meta, a_meta, s_meta, **naming_options)
|
||||||
|
|
||||||
return Target(
|
return Target(
|
||||||
sources.program.meta,
|
sources.program,
|
||||||
VideoTrack(v_meta, v_url),
|
VideoTrack(v_meta, v_url),
|
||||||
AudioTrack(a_meta, a_url),
|
AudioTrack(a_meta, a_url),
|
||||||
SubtitlesTrack(s_meta, s_url) if s_meta else None,
|
SubtitlesTrack(s_meta, s_url) if s_meta else None,
|
||||||
|
|
|
@ -23,7 +23,6 @@ Options:
|
||||||
--version print current version of the program
|
--version print current version of the program
|
||||||
--debug on error, print debugging information
|
--debug on error, print debugging information
|
||||||
--name-use-id use the program ID
|
--name-use-id use the program ID
|
||||||
--name-use-slug use the URL slug
|
|
||||||
--name-sep=<sep> field separator [default: - ]
|
--name-sep=<sep> field separator [default: - ]
|
||||||
--name-seq-pfx=<pfx> sequence counter prefix [default: - ]
|
--name-seq-pfx=<pfx> sequence counter prefix [default: - ]
|
||||||
--name-seq-no-pad disable sequence zero-padding
|
--name-seq-no-pad disable sequence zero-padding
|
||||||
|
|
|
@ -27,38 +27,34 @@ def _fetch_api_data(http_session, path, object_type):
|
||||||
return obj["attributes"]
|
return obj["attributes"]
|
||||||
|
|
||||||
|
|
||||||
def fetch_program_info(http_session, site, program_id):
|
def fetch_program_info(http_session, p_meta):
|
||||||
"""Fetch the given program metadata and indexes."""
|
"""Fetch the given program metadata and indexes."""
|
||||||
obj = _fetch_api_data(http_session, f"config/{site}/{program_id}", "ConfigPlayer")
|
obj = _fetch_api_data(
|
||||||
|
http_session, f"config/{p_meta.site}/{p_meta.id}", "ConfigPlayer"
|
||||||
|
)
|
||||||
|
|
||||||
if (_ := obj["metadata"]["providerId"]) != program_id:
|
if (_ := obj["metadata"]["providerId"]) != p_meta.id:
|
||||||
raise UnexpectedAPIResponse(
|
raise UnexpectedAPIResponse(
|
||||||
"PROGRAM_ID_MISMATCH",
|
"PROGRAM_ID_MISMATCH",
|
||||||
site,
|
p_meta.site,
|
||||||
program_id,
|
p_meta.id,
|
||||||
_,
|
_,
|
||||||
)
|
)
|
||||||
|
|
||||||
program_meta = ProgramMeta(
|
|
||||||
obj["metadata"]["title"],
|
|
||||||
obj["metadata"]["subtitle"],
|
|
||||||
obj["metadata"]["description"],
|
|
||||||
)
|
|
||||||
|
|
||||||
program_index_urls = set()
|
program_index_urls = set()
|
||||||
|
|
||||||
for s in obj["streams"]:
|
for s in obj["streams"]:
|
||||||
if (_ := s["protocol"]) != "HLS_NG":
|
if (_ := s["protocol"]) != "HLS_NG":
|
||||||
raise UnsupportedHLSProtocol(site, program_id, _)
|
raise UnsupportedHLSProtocol(p_meta.site, p_meta.id, _)
|
||||||
|
|
||||||
if (program_index_url := s["url"]) in program_index_urls:
|
if (program_index_url := s["url"]) in program_index_urls:
|
||||||
raise UnexpectedAPIResponse(
|
raise UnexpectedAPIResponse(
|
||||||
"DUPLICATE_PROGRAM_INDEX_URL",
|
"DUPLICATE_PROGRAM_INDEX_URL",
|
||||||
site,
|
p_meta.site,
|
||||||
program_id,
|
p_meta.id,
|
||||||
program_index_url,
|
program_index_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
program_index_urls.add(program_index_url)
|
program_index_urls.add(program_index_url)
|
||||||
|
|
||||||
return program_meta, program_index_urls
|
return program_index_urls
|
||||||
|
|
|
@ -16,14 +16,32 @@ class ModuleError(Exception):
|
||||||
return f"{self.__class__}{self.args!r}"
|
return f"{self.__class__}{self.args!r}"
|
||||||
|
|
||||||
|
|
||||||
|
class ExpectedError(ModuleError):
|
||||||
|
"""A feature limitation to submit as an enhancement to developers."""
|
||||||
|
|
||||||
|
|
||||||
class UnexpectedError(ModuleError):
|
class UnexpectedError(ModuleError):
|
||||||
"""An error to report to developers."""
|
"""An error to report to developers."""
|
||||||
|
|
||||||
|
|
||||||
class InvalidUrl(ModuleError):
|
#
|
||||||
"""Invalid ArteTV URL."""
|
# www
|
||||||
|
#
|
||||||
|
class PageNotFound(ModuleError):
|
||||||
|
"""Page not found at ArteTV."""
|
||||||
|
|
||||||
|
|
||||||
|
class PageNotSupported(ExpectedError):
|
||||||
|
"""The page you are trying to download from is not (yet) supported."""
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidPage(UnexpectedError):
|
||||||
|
"""Invalid ArteTV page."""
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Others
|
||||||
|
#
|
||||||
class UnexpectedAPIResponse(UnexpectedError):
|
class UnexpectedAPIResponse(UnexpectedError):
|
||||||
"""Unexpected response from ArteTV."""
|
"""Unexpected response from ArteTV."""
|
||||||
|
|
||||||
|
|
|
@ -10,15 +10,18 @@ from typing import NamedTuple, Optional
|
||||||
class ProgramMeta(NamedTuple):
|
class ProgramMeta(NamedTuple):
|
||||||
"""A program metadata."""
|
"""A program metadata."""
|
||||||
|
|
||||||
|
site: str
|
||||||
|
"""The site where it is hosted (fr, de, etc...)."""
|
||||||
|
|
||||||
|
id: str
|
||||||
|
"""The ID."""
|
||||||
|
|
||||||
title: str
|
title: str
|
||||||
"""The title."""
|
"""The title."""
|
||||||
|
|
||||||
subtitle: str
|
subtitle: str
|
||||||
"""The subtitle or secondary title."""
|
"""The subtitle or secondary title."""
|
||||||
|
|
||||||
description: str
|
|
||||||
"""The description."""
|
|
||||||
|
|
||||||
|
|
||||||
class VideoMeta(NamedTuple):
|
class VideoMeta(NamedTuple):
|
||||||
"""A video track metadata."""
|
"""A video track metadata."""
|
||||||
|
@ -91,18 +94,10 @@ class Rendition(NamedTuple):
|
||||||
source: tuple[str, Optional[str]]
|
source: tuple[str, Optional[str]]
|
||||||
|
|
||||||
|
|
||||||
class Program(NamedTuple):
|
|
||||||
"""A program representation."""
|
|
||||||
|
|
||||||
id: str
|
|
||||||
slug: str
|
|
||||||
meta: ProgramMeta
|
|
||||||
|
|
||||||
|
|
||||||
class Sources(NamedTuple):
|
class Sources(NamedTuple):
|
||||||
"""A program's sources."""
|
"""A program's sources."""
|
||||||
|
|
||||||
program: Program
|
program: ProgramMeta
|
||||||
variants: list[Variant]
|
variants: list[Variant]
|
||||||
renditions: list[Rendition]
|
renditions: list[Rendition]
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from .model import Program, VideoMeta, AudioMeta, SubtitlesMeta
|
from .model import VideoMeta, AudioMeta, SubtitlesMeta
|
||||||
|
|
||||||
|
|
||||||
def file_name_builder(
|
def file_name_builder(
|
||||||
|
@ -14,7 +14,6 @@ def file_name_builder(
|
||||||
s_meta: Optional[SubtitlesMeta],
|
s_meta: Optional[SubtitlesMeta],
|
||||||
*,
|
*,
|
||||||
use_id=False,
|
use_id=False,
|
||||||
use_slug=False,
|
|
||||||
sep=" - ",
|
sep=" - ",
|
||||||
seq_pfx=" - ",
|
seq_pfx=" - ",
|
||||||
seq_no_pad=False,
|
seq_no_pad=False,
|
||||||
|
@ -32,17 +31,14 @@ def file_name_builder(
|
||||||
def replace_sequence_counter(s: str) -> str:
|
def replace_sequence_counter(s: str) -> str:
|
||||||
return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)
|
return re.sub(r"\s+\((\d+)/(\d+)\)", sub_sequence_counter, s)
|
||||||
|
|
||||||
def build_file_name(program: Program) -> str:
|
def build_file_name(p_meta) -> str:
|
||||||
"""Create a file name for given program."""
|
"""Create a file name for given program."""
|
||||||
if use_id:
|
if use_id:
|
||||||
return program.id
|
return p_meta.id
|
||||||
|
|
||||||
if use_slug:
|
fields = [replace_sequence_counter(p_meta.title)]
|
||||||
return program.slug
|
if p_meta.subtitle:
|
||||||
|
fields.append(replace_sequence_counter(p_meta.subtitle))
|
||||||
fields = [replace_sequence_counter(program.meta.title)]
|
|
||||||
if program.meta.subtitle:
|
|
||||||
fields.add(replace_sequence_counter(program.meta.subtitles))
|
|
||||||
|
|
||||||
if add_resolution:
|
if add_resolution:
|
||||||
fields.append(f"{v_meta.height}p")
|
fields.append(f"{v_meta.height}p")
|
||||||
|
|
|
@ -3,28 +3,85 @@
|
||||||
|
|
||||||
"""Provide ArteTV website utilities."""
|
"""Provide ArteTV website utilities."""
|
||||||
|
|
||||||
from .error import InvalidUrl
|
from contextlib import contextmanager
|
||||||
|
import json
|
||||||
|
|
||||||
BASE = "https://www.arte.tv/"
|
from .error import InvalidPage, PageNotFound, PageNotSupported
|
||||||
SITES = ["fr", "de", "en", "es", "pl", "it"]
|
from .model import ProgramMeta
|
||||||
|
|
||||||
|
_DATA_MARK = '<script id="__NEXT_DATA__" type="application/json">'
|
||||||
|
|
||||||
|
|
||||||
def parse_url(url):
|
@contextmanager
|
||||||
"""Parse ArteTV web URL into target ID and web UI language."""
|
def _schema_guard(*context):
|
||||||
if not url.startswith(BASE):
|
try:
|
||||||
raise InvalidUrl("BASE", url)
|
yield
|
||||||
|
except (KeyError, IndexError, ValueError) as e:
|
||||||
|
raise InvalidPage("SCHEMA", *context, e)
|
||||||
|
|
||||||
path = url[len(BASE) :].split("/")
|
|
||||||
|
|
||||||
site = path.pop(0)
|
def _process_programs_page(page_value):
|
||||||
|
|
||||||
if site not in SITES:
|
with _schema_guard():
|
||||||
raise InvalidUrl("SITE", url, site)
|
site = page_value["language"]
|
||||||
|
|
||||||
if (_ := path.pop(0)) != "videos":
|
content_zones = [
|
||||||
raise InvalidUrl("PATH", url, _)
|
zone
|
||||||
|
for zone in page_value["zones"]
|
||||||
|
if zone["code"].startswith("program_content_")
|
||||||
|
]
|
||||||
|
|
||||||
id = path.pop(0)
|
programs = [
|
||||||
slug = path.pop(0)
|
ProgramMeta(
|
||||||
|
site, data_item["programId"], data_item["title"], data_item["subtitle"]
|
||||||
|
)
|
||||||
|
for zone in content_zones
|
||||||
|
for data_item in zone["content"]["data"]
|
||||||
|
if data_item["type"] == "program"
|
||||||
|
]
|
||||||
|
|
||||||
return site, id, slug
|
if len(content_zones) != 1:
|
||||||
|
raise InvalidPage("PROGRAMS_CONTENT_ZONES_COUNT")
|
||||||
|
|
||||||
|
if len(programs) != 1:
|
||||||
|
raise InvalidPage("PROGRAMS_PROGRAMS_COUNT")
|
||||||
|
|
||||||
|
return programs[0]
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_program(http_session, url):
|
||||||
|
"""Load the ArteTV page at given URL and return list of programs on it."""
|
||||||
|
r = http_session.get(url)
|
||||||
|
|
||||||
|
# special handling of 404
|
||||||
|
if r.status_code == 404:
|
||||||
|
raise PageNotFound(url)
|
||||||
|
|
||||||
|
# other network errors
|
||||||
|
r.raise_for_status()
|
||||||
|
|
||||||
|
# no HTML parsing required, we just find the mark
|
||||||
|
html = r.text
|
||||||
|
start = html.find(_DATA_MARK)
|
||||||
|
if start < 0:
|
||||||
|
raise InvalidPage("DATA_MARK_NOT_FOUND", url)
|
||||||
|
start += len(_DATA_MARK)
|
||||||
|
end = html.index("</script>", start)
|
||||||
|
|
||||||
|
try:
|
||||||
|
next_js_data = json.loads(html[start:end].strip())
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
raise InvalidPage("INVALID_JSON_DATA", url)
|
||||||
|
|
||||||
|
with _schema_guard(url):
|
||||||
|
initial_page_value = next_js_data["props"]["pageProps"]["initialPage"]["value"]
|
||||||
|
initial_type = next_js_data["props"]["pageProps"]["initialType"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
match initial_type:
|
||||||
|
case "programs":
|
||||||
|
return _process_programs_page(initial_page_value)
|
||||||
|
case _:
|
||||||
|
raise PageNotSupported("TYPE_NOT_SUPPORTED", url, initial_type)
|
||||||
|
except InvalidPage as e:
|
||||||
|
raise InvalidPage(e.args[0], url, *e.args[1:])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user