502 lines
15 KiB
Python
502 lines
15 KiB
Python
"""pospell is a spellchecker for po files containing reStructuredText."""
|
||
import collections
|
||
import functools
|
||
import io
|
||
import logging
|
||
import multiprocessing
|
||
import os
|
||
import subprocess
|
||
import sys
|
||
from contextlib import redirect_stderr
|
||
from itertools import chain
|
||
from pathlib import Path
|
||
from shutil import which
|
||
from string import digits
|
||
from typing import List, Tuple
|
||
from unicodedata import category
|
||
|
||
import docutils.frontend
|
||
import docutils.nodes
|
||
import docutils.parsers.rst
|
||
import polib
|
||
import regex
|
||
from docutils.parsers.rst import roles
|
||
from docutils.utils import new_document
|
||
from sphinxlint import rst
|
||
|
||
# Version of pospell, shown by the --version command-line flag.
__version__ = "1.3"

# Per-language default for dropping capitalized words inside sentences;
# main() falls back to False for languages not listed here.
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}

# A reported error: (filename, line number, misspelled word).
Error = Tuple[str, int, str]

# One line of text handed to hunspell, tagged with the po file it comes
# from and its line number in that file.
input_line = collections.namedtuple("input_line", "filename line text")
|
||
|
||
|
||
class POSpellException(Exception):
    """Base class: all exceptions from this module inherit from this one."""
|
||
|
||
|
||
class Unreachable(POSpellException):
    """The code encountered a state that should be unreachable."""
|
||
|
||
|
||
# Probe hunspell once, at import time: keep the first line of its
# `--version` output (used by the --version flag) and bail out early with
# a readable message when the hunspell executable cannot be found.
try:
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
    ).partition("\n")[0]
except FileNotFoundError:
    print("hunspell not found, please install hunspell.", file=sys.stderr)
    sys.exit(1)
|
||
|
||
|
||
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Node used to represent any unknown role, so we can parse any rst blindly."""
|
||
|
||
|
||
def monkey_patch_role(role):
    """Wrap a docutils role lookup function so that it always finds a role.

    `role` is the original lookup callable; it is called with
    (role_name, language_module, lineno, reporter) and is expected to
    return a (role_function, messages) pair.

    The returned wrapper behaves like `role`, except that when the lookup
    yields no role function, the role name is first registered as a
    generic role backed by DummyNodeClass and the lookup is retried.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        lookup_args = (role_name, language_module, lineno, reporter)
        base_role, message = role(*lookup_args)
        if base_role is not None:
            return base_role, message
        # Unknown role: register a catch-all for it, then look it up again.
        roles.register_generic_role(role_name, DummyNodeClass)
        base_role, message = role(*lookup_args)
        return base_role, message

    return role_or_generic
|
||
|
||
|
||
# Install the patched lookup on the docutils roles module (an import-time
# side effect), so unknown roles resolve to DummyNodeClass.
roles.role = monkey_patch_role(roles.role)
|
||
|
||
|
||
class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Recursively convert a docutils node to a Python string.

    Usage:

    >>> visitor = NodeToTextVisitor(document)
    >>> document.walk(visitor)
    >>> print(str(visitor))

    It ignores (see IGNORE_LIST) some nodes, which we don't want in
    hunspell (emphasis typically contains proper names that are unknown
    to dictionaries).
    """

    # Node class names whose children are skipped, see __getattr__.
    # visit_Text is defined explicitly below, so the "Text" entry is never
    # looked up through __getattr__.
    IGNORE_LIST = (
        "emphasis",
        "superscript",
        "title_reference",
        "substitution_reference",
        "citation_reference",
        "strong",
        "DummyNodeClass",
        "reference",
        "literal",
        "Text",
        "system_message",
    )

    def __init__(self, document):
        """Initialize visitor for the given node/document."""
        # Text fragments collected by visit_Text, in visiting order.
        self.output = []
        super().__init__(document)

    def unknown_visit(self, node):
        """Do nothing: mandatory implementation to visit unknown nodes."""

    @staticmethod
    def ignore(node):
        """Skip the children of `node` by raising SkipChildren.

        Used as the visit_* method of every node type in IGNORE_LIST,
        see __getattr__.
        """
        raise docutils.nodes.SkipChildren

    def __getattr__(self, name):
        """Provide `ignore` as visit_<name> for names in IGNORE_LIST."""
        prefix = "visit_"
        ignored = name.startswith(prefix) and name[len(prefix) :] in self.IGNORE_LIST
        if not ignored:
            raise AttributeError(name)
        return self.ignore

    def visit_Text(self, node):
        """Keep this node text, this is typically what we want to spell check."""
        text = docutils.nodes.unescape(node, restore_backslashes=True)
        self.output.append(text)

    def __str__(self):
        """Give the accumulated strings, joined by single spaces."""
        return " ".join(self.output)
|
||
|
||
|
||
def strip_rst(line):
    """Transform reStructuredText to plain text.

    The text is parsed with docutils and rendered back through
    NodeToTextVisitor. Anything docutils writes on stderr while parsing
    is captured and printed (on stdout), followed by the parsed text.
    """
    # Drop :: at the end, it would cause Literal block expected
    text = line[:-2] if line.endswith("::") else line
    text = rst.NORMAL_ROLE_RE.sub("", text)

    settings = docutils.frontend.get_default_settings()
    for option, value in (
        ("pep_references", None),
        ("rfc_references", None),
        ("pep_base_url", "http://www.python.org/dev/peps/"),
        ("pep_file_url_template", "pep-%04d"),
    ):
        setattr(settings, option, value)

    rst_parser = docutils.parsers.rst.Parser()
    captured_stderr = io.StringIO()
    with redirect_stderr(captured_stderr):
        document = new_document("<rst-doc>", settings=settings)
        rst_parser.parse(text, document)
    parser_messages = captured_stderr.getvalue()
    if parser_messages:
        print(parser_messages.strip(), "while parsing:", text)

    visitor = NodeToTextVisitor(document)
    document.walk(visitor)
    return str(visitor)
|
||
|
||
|
||
def clear(line, drop_capitalized=False, po_path=""):
    """Clear various other syntaxes we may encounter in a line.

    Whitespace runs are collapsed to a single space, soft hyphens
    (U+00AD) are removed, then every match of the "to drop" patterns is
    replaced by a single space.

    line: the text to clean.
    drop_capitalized: when true, also drop capitalized words found
        inside sentences.
    po_path: only used as a prefix in debug log messages.

    Returns the cleaned text.
    """
    # Normalize spaces
    line = regex.sub(r"\s+", " ", line).replace("\xad", "")

    # A list rather than a set: the alternation built below and the debug
    # output then come out in the same order on every run.
    to_drop = [
        r'<a href="[^"]*?">',
        r"{[a-z_]*?}",  # Sphinx variable
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
        r"« . »",  # Single letter examples (typically in Unicode documentation)
    ]
    if drop_capitalized:
        to_drop.append(
            # Strip capitalized words in sentences
            r"(?<!\. |^|-)\b(\p{Letter}['’])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
        )
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for pattern in to_drop:
            # finditer + group(0) logs the whole dropped text; findall
            # would return only the capture group for patterns having one.
            for match in regex.finditer(pattern, line):
                logging.debug(
                    "%s: dropping %r via %r due to from %r",
                    po_path,
                    match.group(0),
                    pattern,
                    line,
                )
    return regex.sub("|".join(to_drop), r" ", line)
|
||
|
||
|
||
def quote_for_hunspell(text):
    """Quote a paragraph so hunspell doesn't misinterpret it.

    `text` is an iterable of lines. Every non-empty line gets a "^"
    prefix, empty lines stay empty, and the lines are joined with
    newlines into the returned string.

    Quoting the manpage:
    It is recommended that programmatic interfaces prefix
    every data line with an uparrow to protect themselves
    against future changes in hunspell.
    """
    return "\n".join("^" + line if line else "" for line in text)
|
||
|
||
|
||
def po_to_text(po_path, drop_capitalized=False):
    """Convert a po file to a list of `input_line` to spell check.

    This strips the msgids and all po syntax while keeping lines at
    their same position / line number: empty lines are inserted as
    padding up to each entry's line number, and the cleaned msgstr is
    placed on the line right after it.

    Obsolete entries and entries whose msgstr equals their msgid are
    skipped.

    Raises POSpellException if the file cannot be read or parsed.
    """
    try:
        entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8"))
    except Exception as err:
        raise POSpellException(str(err)) from err

    input_lines = []
    for entry in entries:
        if entry.msgid == entry.msgstr or entry.obsolete:
            continue
        # The list index doubles as the line counter: pad with empty
        # lines until the entry's own line number is reached.
        input_lines.extend(
            input_line(po_path, lineno, "")
            for lineno in range(len(input_lines) + 1, entry.linenum + 1)
        )
        cleaned = clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path)
        input_lines.append(input_line(po_path, len(input_lines) + 1, cleaned))
    return input_lines
|
||
|
||
|
||
def parse_args():
    """Parse and validate command line arguments.

    Prints a message and/or the help, then exits with status 1, when:
    the given personal dictionary does not exist, both
    --drop-capitalized and --no-drop-capitalized are given, or no file
    source at all (positional files, --glob, --modified) is given.

    Returns the argparse namespace.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    add = parser.add_argument
    add(
        "-l",
        "--language",
        type=str,
        default="fr",
        help=(
            "Language to check, you'll have to install the corresponding "
            "hunspell dictionary, on Debian see apt list 'hunspell-*' "
            "(defaults to 'fr')."
        ),
    )
    add(
        "--glob",
        type=str,
        help=(
            "Provide a glob pattern, to be interpreted by pospell, "
            "to find po files, like --glob '**/*.po'."
        ),
    )
    add(
        "--drop-capitalized",
        action="store_true",
        help=(
            "Always drop capitalized words in sentences "
            "(defaults according to the language)."
        ),
    )
    add(
        "--no-drop-capitalized",
        action="store_true",
        help=(
            "Never drop capitalized words in sentences "
            "(defaults according to the language)."
        ),
    )
    add(
        "po_file",
        nargs="*",
        type=Path,
        help=(
            "Files to check, can optionally be mixed with --glob, or not, "
            "use the one that fit your needs."
        ),
    )
    add(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    add(
        "--version",
        action="version",
        version=f"%(prog)s {__version__} using hunspell: {HUNSPELL_VERSION}",
    )
    add("--debug", action="store_true")
    add("-p", "--personal-dict", type=Path)
    add("--modified", "-m", action="store_true", help="Use git to find modified files.")
    add(
        "-j",
        "--jobs",
        type=int,
        default=os.cpu_count(),
        help="Number of files to check in paralel, defaults to all available CPUs",
    )

    options = parser.parse_args()

    dictionary = options.personal_dict
    if dictionary is not None and not dictionary.exists():
        print(f"Error: dictionary {str(dictionary)!r} not found.")
        sys.exit(1)
    if options.drop_capitalized and options.no_drop_capitalized:
        print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
        parser.print_help()
        sys.exit(1)
    # At least one source of files is needed.
    if not (options.po_file or options.modified or options.glob):
        parser.print_help()
        sys.exit(1)
    return options
|
||
|
||
|
||
def look_like_a_word(word):
    """Return True if the given str looks like a word.

    Used to filter out non-words like `---` or `-0700` so they don't
    get reported. They typically are not errors.

    A str is rejected when it is empty, contains an ASCII digit,
    contains a dash, or has more than one uppercase letter (probably an
    acronym, or a name like CPython, macOS, SQLite, ...).
    """
    if not word:
        return False
    has_digit = any(char in digits for char in word)
    uppercase_letters = sum(1 for char in word if category(char) == "Lu")
    return not (has_digit or uppercase_letters > 1 or "-" in word)
|
||
|
||
|
||
def run_hunspell(language, personal_dict, input_lines) -> List[Error]:
    """Run hunspell over the given input lines.

    language: dictionary name given to hunspell's -d option.
    personal_dict: optional personal dictionary given to hunspell's -p
        option (skipped when falsy).
    input_lines: sequence of `input_line`; their texts are quoted and
        sent to hunspell on its standard input.

    Returns the errors found, as parsed by parse_hunspell_output.

    Raises POSpellException when hunspell exits with a non-zero status,
    so that a failed run is not mistaken for a run without any spelling
    error.
    """
    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
    try:
        output = subprocess.check_output(
            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
            universal_newlines=True,
            input=quote_for_hunspell(text for _, _, text in input_lines),
        )
    except subprocess.CalledProcessError as err:
        # Returning an empty list here would report "no spelling error"
        # for text that was never checked.
        raise POSpellException(
            f"hunspell exited with status {err.returncode} for language "
            f"{language!r}, is the corresponding dictionary installed?"
        ) from err
    return parse_hunspell_output(input_lines, output.splitlines())
|
||
|
||
|
||
def flatten(list_of_lists):
    """Concatenate the inner lists: [[a,b,c], [d,e,f]] -> [a,b,c,d,e,f]."""
    return list(chain.from_iterable(list_of_lists))
|
||
|
||
|
||
def spell_check(
    po_files,
    personal_dict=None,
    language="en_US",
    drop_capitalized=False,
    debug_only=False,
    jobs=os.cpu_count(),
):
    """Check for spelling mistakes in the given po_files.

    (po format, containing restructuredtext), for the given language.
    personal_dict allow to pass a personal dict (-p) option, to hunspell.

    Debug only will show what's passed to Hunspell instead of passing it.

    jobs is the number of worker processes; None (which is what
    os.cpu_count() gives when the CPU count cannot be determined) is
    resolved to the CPU count, or to 1 when that is unknown.

    Each error is printed as filename:line:word. Returns the number of
    errors found (0 in debug-only mode or when there is nothing to check).
    """
    if jobs is None:
        # The chunk size computation below needs an actual integer.
        jobs = os.cpu_count() or 1

    # Pool.__exit__ calls terminate() instead of close(), we need the latter,
    # which ensures the processes' atexit handlers execute fully, which in
    # turn lets coverage write the sub-processes' coverage information
    pool = multiprocessing.Pool(jobs)  # pylint: disable=consider-using-with
    try:
        input_lines = flatten(
            pool.map(
                functools.partial(po_to_text, drop_capitalized=drop_capitalized),
                po_files,
            )
        )
        if debug_only:
            for filename, line, text in input_lines:
                print(filename, line, text, sep=":")
            return 0
        if not input_lines:
            return 0

        # Distribute input lines across workers (ceiling division, so at
        # most `jobs` chunks are produced).
        lines_per_job = (len(input_lines) + jobs - 1) // jobs
        chunked_inputs = [
            input_lines[i : i + lines_per_job]
            for i in range(0, len(input_lines), lines_per_job)
        ]
        errors = flatten(
            pool.map(
                functools.partial(run_hunspell, language, personal_dict),
                chunked_inputs,
            )
        )
    finally:
        pool.close()
        pool.join()

    for error in errors:
        print(*error, sep=":")
    return len(errors)
|
||
|
||
|
||
def parse_hunspell_output(inputs, outputs) -> List[Error]:
    """Parse `hunspell -a` output and collect all errors.

    inputs: the `input_line` items that were sent to hunspell.
    outputs: the lines of hunspell's output, banner included.

    Inputs and output lines are walked in lockstep: an input with an
    empty text consumes a single output line, any other input consumes
    output lines up to and including the next empty one. Only output
    lines starting with "&" are considered, and the word they report is
    kept only if it looks like a word.

    Returns a list of (filename, line, word) tuples.
    """
    # skip first line of hunspell output (it's the banner)
    remaining = iter(outputs[1:])
    errors = []
    # zip() and the inner next() deliberately share the same iterator.
    for po_input_line, output_line in zip(inputs, remaining):
        if not po_input_line.text:
            continue
        while output_line:
            if output_line.startswith("&"):
                _, original, *_ = output_line.split()
                if look_like_a_word(original):
                    errors.append(
                        (po_input_line.filename, po_input_line.line, original)
                    )
            # An exhausted output ends the inner loop like an empty line.
            output_line = next(remaining, "")
    return errors
|
||
|
||
|
||
def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed.

    Returns silently when `language` is the last path component of one
    of the lines printed by `hunspell -D`. Otherwise prints installation
    hints on stderr and exits with status 1.
    """
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    # Split into lines first: iterating over the string itself yields
    # single characters, which can never match a language name.
    languages = {Path(line.strip()).name for line in hunspell_dash_d.splitlines()}

    def error(*args, file=sys.stderr, **kwargs):
        """Print on stderr by default."""
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error(f" sudo apt install hunspell-{language}")
    else:
        error(
            f"""I don't know your environment, but I bet the package name looks like:

hunspell-{language}

If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
"""
        )
    sys.exit(1)
|
||
|
||
|
||
def main():
    """Entry point (for command-line).

    Gathers the po files to check (positional arguments, --glob and
    --modified), runs the spell check and exits with status 0 when no
    error was found, -1 otherwise (including on POSpellException).
    """
    args = parse_args()
    # Each -v lowers the threshold by one level, starting from CRITICAL (50).
    logging.basicConfig(level=50 - 10 * args.verbose)
    default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = default_drop_capitalized
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        # -z gives NUL-terminated "XY <path>" entries with unquoted paths,
        # so paths containing whitespace are parsed correctly instead of
        # breaking a whitespace-based split.
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain", "--no-renames", "-z"],
            encoding="utf-8",
        )
        for entry in git_status.split("\0"):
            status, filename = entry[:2].strip(), entry[3:]
            if filename.endswith(".po") and status != "D":
                args.po_file.append(Path(filename))
    try:
        errors = spell_check(
            args.po_file,
            args.personal_dict,
            args.language,
            drop_capitalized,
            args.debug,
            args.jobs,
        )
    except POSpellException as err:
        print(err, file=sys.stderr)
        sys.exit(-1)
    # NOTE(review): spell_check as written returns a count >= 0, so this
    # branch looks unreachable -- confirm before relying on it.
    if errors == -1:
        gracefull_handling_of_missing_dicts(args.language)
    sys.exit(0 if errors == 0 else -1)
|
||
|
||
|
||
# Run the command-line entry point when executed as a script.
if __name__ == "__main__":
    main()
|