padpo/padpo/checkers/grammalecte.py

176 lines
6.1 KiB
Python
Raw Normal View History

2019-11-22 11:50:24 +00:00
"""Checker for grammar errors."""
import json
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Set
2019-11-22 11:50:24 +00:00
from zipfile import ZipFile
import requests
import simplelogging
from padpo.checkers.baseclass import Checker, replace_quotes
from padpo.pofile import PoItem, PoFile
# Module-level logger used by GrammalecteChecker and install_grammalecte().
log = simplelogging.get_logger()
class GrammalecteChecker(Checker):
    """Checker for grammar and spelling errors.

    The text of a `*.po` file is written to a temporary file, analysed by
    the ``grammalecte-cli.py`` command line tool, and every reported error
    that is not filtered out is attached as a warning to the matching item
    of the `PoFile`.
    """

    name = "Grammalecte"

    # Spelling white list downloaded by get_personal_dict().
    _PERSONAL_DICT_URL = (
        "https://raw.githubusercontent.com/python/python-docs-fr/3.8/dict"
    )
    # Seconds to wait for the white list server before giving up.
    _REQUEST_TIMEOUT = 30

    # Number of characters shown before/after an error in a warning.
    _CONTEXT_BEFORE = 40
    _CONTEXT_AFTER = 10

    # Grammalecte rule identifiers that are always ignored.
    _IGNORED_RULE_IDS = (
        "esp_milieu_ligne",  # double space
        "nbsp_avant_deux_points",  # NBSP
        "nbsp_avant_double_ponctuation",  # NBSP
    )
    # Exact Grammalecte messages that are always ignored.
    _IGNORED_MESSAGES = (
        "Accord de genre erroné : « ABC » est masculin.",
        "Accord de genre erroné : « PEP » est masculin.",
        "Accord de nombre erroné : « PEP » devrait être au pluriel.",
        "Accord de genre erroné : « une entrée » est féminin, « utilisateur » est masculin.",
    )

    def __init__(self):
        """Initialiser.

        Downloads the spelling white list, so creating an instance
        performs a network request.
        """
        super().__init__()
        self.dir = None
        self.personal_dict: Set[str] = set()
        self.get_personal_dict()

    @staticmethod
    def run_grammalecte(filename: str) -> subprocess.CompletedProcess:
        """Run ``grammalecte-cli.py`` on `filename` and return the result.

        The tool is called with ``-off apos``, ``--json`` and
        ``--only_when_errors``; its output is captured as text.

        Raises `FileNotFoundError` when the executable cannot be found.
        """
        command = [
            "grammalecte-cli.py",
            "-f",
            filename,
            "-off",
            "apos",
            "--json",
            "--only_when_errors",
        ]
        return subprocess.run(command, capture_output=True, text=True)

    def check_file(self, pofile: PoFile):
        """Check a `*.po` file.

        Warnings are added to the items of `pofile`. If the Grammalecte
        CLI is missing, an installation is attempted once and the
        analysis is retried.
        """
        if not isinstance(pofile, PoFile):
            # Logged only: processing still continues with the object.
            log.error("%s is not an instance of PoFile", str(pofile))

        fd, filename = tempfile.mkstemp(
            suffix=".txt", prefix="padpo_", text=True
        )
        try:
            # Wrap the descriptor returned by mkstemp() instead of opening
            # the path a second time, so the descriptor is closed with the
            # file object and not leaked.
            with open(fd, "w", encoding="utf8") as f:
                text = pofile.rst2txt()
                # NOTE(review): the end of this pattern was truncated in
                # the reviewed source and has been reconstructed as
                # `\s»` -- confirm against replace_quotes().
                text = re.sub(r"«\s(.*?)\s»", replace_quotes, text)
                f.write(text)

            try:
                result = self.run_grammalecte(filename)
            except FileNotFoundError as e:
                if e.filename != "grammalecte-cli.py":
                    # Not the "CLI is missing" case: nothing to install,
                    # let the real error surface.
                    raise
                install_grammalecte()
                result = self.run_grammalecte(filename)

            if result.stdout:
                warnings = json.loads(result.stdout)
                self.manage_grammar_errors(warnings, pofile)
                self.manage_spelling_errors(warnings, pofile)
        finally:
            # Always remove the temporary file, even when a step failed.
            Path(filename).unlink()

    def check_item(self, item: PoItem):
        """Check an item in a `*.po` file (does nothing)."""
        pass

    @staticmethod
    def _find_item(warning, pofile: PoFile) -> PoItem:
        """Return the item of `pofile` a Grammalecte paragraph refers to."""
        # The paragraph index is halved to get the item index; presumably
        # rst2txt() emits two paragraphs per item -- TODO confirm.
        return pofile.content[int(warning["iParagraph"]) // 2]

    @classmethod
    def _excerpt(cls, item: PoItem, error) -> str:
        """Return the text surrounding `error`, wrapped in ``###``."""
        start = max(0, int(error["nStart"]) - cls._CONTEXT_BEFORE)
        end = max(0, int(error["nEnd"]) + cls._CONTEXT_AFTER)
        return f"###{item.msgstr_rst2txt[start:end]}###"

    def manage_grammar_errors(self, warnings, pofile: PoFile):
        """Manage grammar errors returned by grammalecte.

        `warnings` is the decoded JSON output of the CLI; each entry of
        ``warnings["data"]`` carries a ``lGrammarErrors`` list.
        """
        for warning in warnings["data"]:
            for error in warning["lGrammarErrors"]:
                if self.filter_out_grammar_error(error):
                    continue
                item = self._find_item(warning, pofile)
                item.add_warning(
                    self.name,
                    f'{error["sMessage"]} => ' f"{self._excerpt(item, error)}",
                )

    def filter_out_grammar_error(self, error):
        """Return True when grammalecte error should be ignored."""
        rule_id = error["sRuleId"]
        if rule_id in self._IGNORED_RULE_IDS:
            return True
        if "typo_guillemets_typographiques_simples" in rule_id:
            return True  # ignore ' quotes

        message = error["sMessage"]
        if message in self._IGNORED_MESSAGES:
            return True

        # Drop typographic and ASCII apostrophes before comparing, so the
        # match does not depend on which apostrophe the message uses.
        without_apostrophes = message.replace("\u2019", "").replace("'", "")
        if "Sil sagit dun impératif" in without_apostrophes:
            if int(error["nStart"]) == 0:
                # ignore imperative conjugation at beginning of 1st sentence
                return True
        return False

    def manage_spelling_errors(self, warnings, pofile: PoFile):
        """Manage spelling errors returned by grammalecte.

        `warnings` is the decoded JSON output of the CLI; each entry of
        ``warnings["data"]`` carries a ``lSpellingErrors`` list.
        """
        for warning in warnings["data"]:
            for error in warning["lSpellingErrors"]:
                if self.filter_out_spelling_error(error):
                    continue
                item = self._find_item(warning, pofile)
                word = error["sValue"]
                item.add_warning(
                    self.name,
                    f'Unknown word "{word}" in ' f"{self._excerpt(item, error)}",
                )

    def filter_out_spelling_error(self, error):
        """Return True when grammalecte error should be ignored."""
        word = error["sValue"]
        if set(word) == {"x"}:
            return True  # word is xxxxx or xxxxxxxx…
        # white list
        return word.strip() in self.personal_dict

    def get_personal_dict(self):
        """
        Add spelling white list.

        Based on
        https://raw.githubusercontent.com/python/python-docs-fr/3.8/dict

        Each word is stored as written and in title case. Raises a
        `requests` exception when the download fails or times out.
        """
        download_request = requests.get(
            self._PERSONAL_DICT_URL, timeout=self._REQUEST_TIMEOUT
        )
        download_request.raise_for_status()
        for line in download_request.text.splitlines():
            word = line.strip()
            if not word:
                continue  # a blank line must not white-list the empty word
            self.personal_dict.add(word)
            self.personal_dict.add(word.title())
2019-11-22 11:50:24 +00:00
def install_grammalecte():
    """Install grammalecte CLI.

    Downloads the Grammalecte 1.5.0 archive into a fresh temporary
    directory, extracts it and installs it with ``pip``. The temporary
    directory is not removed afterwards.

    Raises a `requests` exception when the download fails or times out,
    and `subprocess.CalledProcessError` when ``pip install`` fails.
    """
    log.warning("Missing grammalecte, trying to install it")
    # mkdtemp() already creates the directory, no extra mkdir() is needed.
    workdir = Path(tempfile.mkdtemp(prefix="padpo_grammalecte_"))
    download_request = requests.get(
        "https://grammalecte.net/grammalecte/zip/Grammalecte-fr-v1.5.0.zip",
        timeout=60,
    )
    download_request.raise_for_status()
    zip_file = workdir / "Grammalecte-fr-v1.5.0.zip"
    zip_file.write_bytes(download_request.content)
    extracted_dir = workdir / "Grammalecte-fr-v1.5.0"
    with ZipFile(zip_file, "r") as zip_obj:
        zip_obj.extractall(extracted_dir)
    # `pip` is resolved from PATH, like `grammalecte-cli.py` itself.
    # check=True turns a failed installation into an explicit error
    # instead of letting it go unnoticed.
    subprocess.run(["pip", "install", str(extracted_dir)], check=True)