padpo/padpo/checkers/grammalecte.py

128 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Checker for grammar errors."""
import re
from pathlib import Path
from typing import Set, Optional
import requests
import simplelogging
from pygrammalecte import (
GrammalecteGrammarMessage,
GrammalecteMessage,
GrammalecteSpellingMessage,
grammalecte_text,
)
from padpo.checkers.baseclass import Checker, replace_quotes
from padpo.checkers.glossary import glossary
from padpo.pofile import PoFile, PoItem
# Module-level logger, used by the checker below to report errors.
log = simplelogging.get_logger()
class GrammalecteChecker(Checker):
    """Checker reporting grammar and spelling errors found by Grammalecte.

    The whole file is submitted to Grammalecte at once (see ``check_file``).
    Every returned message that is not filtered out is attached as a warning
    to the matching item of the file.
    """

    name = "Grammalecte"

    def __init__(self):
        """Initialise the checker with an empty personal dictionary."""
        super().__init__()
        # Words that must never be reported as spelling errors; filled by
        # ``configure`` from the ``--dict`` command line option.
        self.personal_dict: Set[str] = set()

    def check_file(self, pofile: PoFile):
        """Check a `*.po` file.

        The file is converted to text with ``pofile.rst2txt()``, every
        « … » quotation is rewritten by ``replace_quotes`` and the result is
        passed to Grammalecte. The resulting messages are dispatched by
        ``manage_warnings``.

        Args:
            pofile: the file to check. Anything that is not a ``PoFile`` is
                reported in the log and skipped.
        """
        if not isinstance(pofile, PoFile):
            log.error("%s is not an instance of PoFile", str(pofile))
            # Nothing can be checked: stop here instead of failing below on
            # an object that does not provide the PoFile API.
            return
        text = pofile.rst2txt()
        # ``\s`` also matches the non-breaking spaces used inside French
        # quotation marks.
        text = re.sub(r"«\s(.*?)\s»", replace_quotes, text)
        warnings = grammalecte_text(text)
        self.manage_warnings(warnings, pofile)

    def check_item(self, item: PoItem):
        """Check an item in a `*.po` file (does nothing)."""

    def manage_warnings(self, warnings: GrammalecteMessage, pofile: PoFile) -> None:
        """Attach the relevant Grammalecte messages to the items of the file.

        Args:
            warnings: iterable of messages returned by Grammalecte.
            pofile: file whose items receive the warnings.
        """
        for warning in warnings:
            if self.filter_out_grammar_error(warning):
                continue
            if self.filter_out_spelling_error(warning):
                continue
            # NOTE(review): presumably the text built by rst2txt() uses two
            # lines per item, hence the integer division - confirm in PoFile.
            item = pofile.content[warning.line // 2]
            # Show some context around the error: up to 40 characters before
            # its start and 10 characters after its end.
            start = max(0, warning.start - 40)
            end = warning.end + 10
            item.add_warning(
                self.name,
                f"{warning.message} => ###{item.msgstr_rst2txt[start:end]}###",
            )

    def filter_out_grammar_error(self, warning: GrammalecteMessage) -> bool:
        """Return True when a Grammalecte grammar error should be ignored.

        Messages that are not grammar messages are never filtered out here.
        """
        if not isinstance(warning, GrammalecteGrammarMessage):
            return False
        if warning.rule in (
            "esp_milieu_ligne",  # double space
            "nbsp_avant_deux_points",  # NBSP
            "nbsp_avant_double_ponctuation",  # NBSP
        ):
            return True
        if "typo_guillemets_typographiques_simples" in warning.rule:
            return True  # ignore simple typographic quotes
        # NOTE(review): these literals must match Grammalecte's output
        # character for character, including any non-breaking space around
        # the punctuation - confirm against real messages.
        if warning.message in (
            "Accord de genre erroné : « ABC » est masculin.",
            "Accord de genre erroné : « PEP » est masculin.",
            "Accord de nombre erroné : « PEP » devrait être au pluriel.",
            "Accord de genre erroné : « une entrée » est féminin, « utilisateur » est masculin.",
        ):
            return True
        # Ignore imperative conjugation at the beginning of the 1st sentence.
        # NOTE(review): the apostrophes below are typographic ones (U+2019),
        # presumably what Grammalecte emits - confirm.
        return "S’il s’agit d’un impératif" in warning.message and warning.start == 0

    def filter_out_spelling_error(self, warning: GrammalecteMessage) -> bool:
        """Return True when a Grammalecte spelling error should be ignored.

        Messages that are not spelling messages are never filtered out here.
        """
        if not isinstance(warning, GrammalecteSpellingMessage):
            return False
        word = warning.word
        if set(word) == {"x"}:
            return True  # word is xxxxx or xxxxxxxx…
        if word.strip() in self.personal_dict:
            return True  # white list
        if word.endswith("_"):
            return True
        lowered = word.lower()
        # "uplet" is a partially italic word in the glossary.
        return lowered in glossary or lowered == "uplet"

    def _get_personal_dict(self, dict_path: str) -> None:
        """Add the words of a personal dictionary to ``self.personal_dict``.

        Each word is stored as written and in title case.

        Args:
            dict_path: location of the dictionary, one word per line. A value
                containing ``://`` is downloaded with ``requests``; any other
                value is read as a local UTF-8 file.

        Raises:
            requests.HTTPError: when the download answers with an HTTP error
                status.
        """
        if "://" in dict_path:
            download_request = requests.get(dict_path)
            download_request.raise_for_status()
            content = download_request.text
        else:
            content = Path(dict_path).read_text(encoding="UTF-8")
        for line in content.splitlines():
            word = line.strip()
            if not word:
                # A blank line must not put the empty string in the whitelist.
                continue
            self.personal_dict.add(word)
            self.personal_dict.add(word.title())

    def add_arguments(self, parser):
        """Register the ``--dict`` command line option on ``parser``."""
        parser.add_argument(
            "--dict",
            nargs="*",
            dest="dicts",
            help="Personal dict files or URLs. Should contain one word per line.",
        )

    def configure(self, args):
        """Store the result of parse_args, to get back arguments from self.add_arguments."""
        # ``args.dicts`` is falsy when the option is absent or given without
        # any value: nothing to load in that case.
        for dict_path in args.dicts or ():
            self._get_personal_dict(dict_path)