Refactor website URL parsing

Switch to a lighter implementation that uses `target_id` instead of `program_id`,
in preparation for #7.
This commit is contained in:
Barbagus 2022-12-27 07:52:35 +01:00
parent 4c518993ef
commit 305d8ab679
1 changed file with 13 additions and 18 deletions

View File

@@ -3,9 +3,9 @@
"""Provide ArteTV website utilities."""
from urllib.parse import urlparse
from . import common
BASE = "https://www.arte.tv/"
LANGUAGES = ["fr", "de", "en", "es", "pl", "it"]
@@ -13,26 +13,21 @@ class InvalidUrl(common.Error):
"""Invalid ArteTV URL."""
def parse_url(program_page_url):
"""Parse ArteTV web URL into UI language and program ID."""
try:
url = urlparse(program_page_url)
except ValueError:
raise InvalidUrl("URL_PARSE", program_page_url, url.hostname)
def parse_url(url):
"""Parse ArteTV web URL into target ID and web UI language."""
if not url.startswith(BASE):
raise InvalidUrl("BASE", url)
if url.hostname != "www.arte.tv":
raise InvalidUrl("HOST_NAME", program_page_url, url.hostname)
path = url[len(BASE) :].split("/")
program_page_path = url.path.split("/")[1:]
www_lang = path.pop(0)
lang = program_page_path.pop(0)
if www_lang not in LANGUAGES:
raise InvalidUrl("WWW_LANG", url, www_lang)
if lang not in LANGUAGES:
raise InvalidUrl("WWW_LANGUAGE", program_page_url, lang)
if (_ := path.pop(0)) != "videos":
raise InvalidUrl("PATH", url, _)
if program_page_path.pop(0) != "videos":
raise InvalidUrl("PATH", program_page_url, program_page_path)
target_id = path.pop(0)
program_id = program_page_path.pop(0)
return lang, program_id
return www_lang, target_id