pospell/pospell.py

472 lines
14 KiB
Python
Raw Normal View History

2020-11-23 13:26:34 +00:00
"""pospell is a spellcheckers for po files containing reStructuedText."""
2018-07-27 19:57:44 +00:00
import io
from string import digits
from unicodedata import category
2018-07-31 22:20:03 +00:00
import logging
2018-07-23 15:37:50 +00:00
import subprocess
2018-07-28 22:58:20 +00:00
import sys
2020-11-23 13:26:34 +00:00
from typing import Dict
2019-08-20 14:38:03 +00:00
from contextlib import redirect_stderr
from itertools import chain
2018-07-23 15:37:50 +00:00
from pathlib import Path
from shutil import which
2018-07-27 09:38:17 +00:00
2018-07-27 19:57:44 +00:00
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
2018-07-28 22:58:20 +00:00
import polib
2018-07-27 19:57:44 +00:00
from docutils.parsers.rst import roles
from docutils.utils import new_document
2018-07-31 22:20:03 +00:00
import regex
2020-10-13 22:44:09 +00:00
# Version of pospell itself; reported by --version next to hunspell's version.
__version__ = "1.0.11"
# Per-language default for dropping capitalized words. Languages that are not
# listed here default to False (looked up with .get(language, False) in main).
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
class POSpellException(Exception):
    """Root of the exception hierarchy of this module.

    Every exception raised on purpose by pospell derives from this class, so
    callers can catch it to handle any pospell-specific failure.
    """
class Unreachable(POSpellException):
    """Raised when the code reaches a state that should be impossible."""
2018-07-28 22:58:20 +00:00
# Probe hunspell once at import time: its version banner is reused by
# --version, and a missing binary is reported right away.
try:
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
    ).partition("\n")[0]  # keep only the first line of the banner
except FileNotFoundError:
    print("hunspell not found, please install hunspell.", file=sys.stderr)
    sys.exit(1)
2018-07-28 22:58:20 +00:00
2018-07-27 19:57:44 +00:00
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Placeholder node registered for any role docutils does not know.

    It lets arbitrary reStructuredText be parsed blindly, whatever roles it
    uses.
    """
2018-07-27 19:57:44 +00:00
def monkey_patch_role(role):
    """Wrap a docutils role lookup function so that it always finds a role.

    `role` is the original lookup callable. The returned wrapper calls it
    and, when no role is found (first returned item is None), registers
    DummyNodeClass as a generic role under that name and looks it up again.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        lookup_args = (role_name, language_module, lineno, reporter)
        base_role, message = role(*lookup_args)
        if base_role is not None:
            return base_role, message
        # Unknown role: register a generic placeholder, then retry the lookup.
        roles.register_generic_role(role_name, DummyNodeClass)
        base_role, message = role(*lookup_args)
        return base_role, message

    return role_or_generic
# Install the wrapper globally, at import time, so every later role lookup
# done through docutils.parsers.rst.roles.role goes through role_or_generic.
roles.role = monkey_patch_role(roles.role)
class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Collect the text of a docutils tree into a single Python string.

    Usage:

    >>> visitor = NodeToTextVisitor(document)
    >>> document.walk(visitor)
    >>> print(str(visitor))

    Node types named in IGNORE_LIST are skipped together with their
    children: we do not want them sent to hunspell (emphasis, for instance,
    typically contains proper names that are unknown to dictionaries).
    """

    # "Text" is listed here, but visit_Text is defined explicitly below, so
    # __getattr__ is never consulted for it and text nodes are kept.
    IGNORE_LIST = (
        "emphasis",
        "superscript",
        "title_reference",
        "strong",
        "DummyNodeClass",
        "reference",
        "literal",
        "Text",
    )

    def __init__(self, document):
        """Start with an empty output for the given node/document."""
        self.output = []
        super().__init__(document)

    def unknown_visit(self, node):
        """Do nothing for node types without a dedicated visit method."""

    @staticmethod
    def ignore(node):
        """Raise SkipChildren.

        Serves as the visit_* method of every node type in IGNORE_LIST,
        see __getattr__.
        """
        raise docutils.nodes.SkipChildren

    def __getattr__(self, name):
        """Provide `ignore` as visit_<name> for the names in IGNORE_LIST."""
        prefix = "visit_"
        is_ignored_visit = (
            name.startswith(prefix) and name[len(prefix) :] in self.IGNORE_LIST
        )
        if not is_ignored_visit:
            raise AttributeError(name)
        return self.ignore

    def visit_Text(self, node):
        """Record the raw source of a text node: this is what gets checked."""
        self.output.append(node.rawsource)

    def __str__(self):
        """Return the collected pieces of text joined by single spaces."""
        return " ".join(self.output)
2018-07-23 15:37:50 +00:00
def strip_rst(line):
    """Return the plain text of a reStructuredText snippet.

    The snippet is parsed with docutils and flattened with
    NodeToTextVisitor. Whatever is written to stderr while parsing is
    captured and printed, followed by the line being parsed.
    """
    if line.endswith("::"):
        # Drop :: at the end, it would cause Literal block expected
        line = line[:-2]
    settings = docutils.frontend.Values(
        {
            "report_level": 2,
            "halt_level": 4,
            "exit_status_level": 5,
            "debug": None,
            "warning_stream": None,
            "error_encoding": "utf-8",
            "error_encoding_error_handler": "backslashreplace",
            "language_code": "en",
            "id_prefix": "",
            "auto_id_prefix": "id",
            "pep_references": None,
            "pep_base_url": "http://www.python.org/dev/peps/",
            "pep_file_url_template": "pep-%04d",
            "rfc_references": None,
            "rfc_base_url": "http://tools.ietf.org/html/",
            "tab_width": 8,
            "trim_footnote_reference_space": None,
            "syntax_highlight": "long",
        }
    )
    rst_parser = docutils.parsers.rst.Parser()
    captured_stderr = io.StringIO()
    with redirect_stderr(captured_stderr):
        document = new_document("<rst-doc>", settings=settings)
        rst_parser.parse(line, document)
    parser_complaints = captured_stderr.getvalue()
    if parser_complaints:
        print(parser_complaints.strip(), "while parsing:", line)
    text_collector = NodeToTextVisitor(document)
    document.walk(text_collector)
    return str(text_collector)
2018-07-23 15:37:50 +00:00
def clear(line, drop_capitalized=False, po_path=""):
    """Clear various other syntaxes we may encounter in a line.

    Whitespace runs are collapsed to a single space and soft hyphens are
    removed, then every match of the "to drop" patterns is replaced by a
    space.

    line: the text to clean.
    drop_capitalized: when true, capitalized words inside sentences are
        dropped too.
    po_path: only used to prefix the debug log messages.

    Returns the cleaned line.
    """
    # Normalize spaces
    line = regex.sub(r"\s+", " ", line).replace("\xad", "")

    # A list (not a set) so the order of the alternation below is the same
    # on every run.
    to_drop = [
        r'<a href="[^"]*?">',
        r"{[a-z_]*?}",  # Sphinx variable
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
        r"« . »",  # Single letter examples (typically in Unicode documentation)
    ]
    if drop_capitalized:
        to_drop.append(
            # Strip capitalized words in sentences
            r"(?<!\. |^|-)\b(\p{Letter}['])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
        )
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for pattern in to_drop:
            # finditer + group(0) gives the whole dropped text; findall would
            # give only the capturing group of patterns that have one.
            for match in regex.finditer(pattern, line):
                logging.debug(
                    "%s: dropping %r via %r due to from %r",
                    po_path,
                    match.group(0),
                    pattern,
                    line,
                )
    return regex.sub("|".join(to_drop), r" ", line)
def quote_for_hunspell(text):
    """Prefix every non-empty line of text with "^" for hunspell.

    Quoting the manpage:

      It is recommended that programmatic interfaces prefix
      every data line with an uparrow to protect themselves
      against future changes in hunspell.

    Empty lines are left empty.
    """
    return "\n".join("^" + line if line else "" for line in text.split("\n"))
def po_to_text(po_path, drop_capitalized=False):
    """Extract the translated text of a po file, one entry per line.

    Entries whose msgstr equals their msgid are skipped. The text of each
    remaining entry (rst stripped, then cleared) is placed after padding the
    output with empty lines up to the entry's line number, so that positions
    in the output follow positions in the po file.

    Raises POSpellException if the file cannot be read or parsed.
    """
    try:
        entries = polib.pofile(Path(po_path).read_text())
    except Exception as err:
        raise POSpellException(str(err)) from err
    text_lines = []
    for entry in entries:
        if entry.msgid == entry.msgstr:
            continue
        # Pad with empty lines until the entry's own line number is reached.
        missing = entry.linenum - len(text_lines)
        if missing > 0:
            text_lines.extend([""] * missing)
        text_lines.append(
            clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path)
        )
    return "\n".join(text_lines)
2018-07-23 15:37:50 +00:00
2018-07-28 22:58:20 +00:00
def parse_args():
    """Parse command line arguments.

    Returns the argparse namespace. Exits with status 1 (after printing the
    help) when both --drop-capitalized and --no-drop-capitalized are given,
    or when there is nothing to check: no po file, no --glob and no
    --modified.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "--drop-capitalized",
        action="store_true",
        help="Always drop capitalized words in sentences"
        " (defaults according to the language).",
    )
    parser.add_argument(
        "--no-drop-capitalized",
        action="store_true",
        help="Never drop capitalized words in sentences"
        " (defaults according to the language).",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    parser.add_argument(
        "--modified", "-m", action="store_true", help="Use git to find modified files."
    )
    args = parser.parse_args()
    if args.drop_capitalized and args.no_drop_capitalized:
        print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
        parser.print_help()
        sys.exit(1)
    # --glob alone is a valid way to select files (see its help text), so it
    # must count as "something to check" just like po_file and --modified.
    if not args.po_file and not args.glob and not args.modified:
        parser.print_help()
        sys.exit(1)
    return args
2018-07-28 22:58:20 +00:00
def look_like_a_word(word):
    """Tell whether the given str looks like a real word.

    Used to filter out non-words like `---` or `-0700` so they don't
    get reported. They typically are not errors.

    A candidate is rejected when it is empty, contains a digit, contains
    more than one uppercase letter (probably an acronym, or a name like
    CPython, macOS, SQLite, ...) or contains a dash.
    """
    if not word:
        return False
    contains_digit = any(char in digits for char in word)
    uppercase_count = sum(1 for char in word if category(char) == "Lu")
    return not (contains_digit or uppercase_count > 1 or "-" in word)
def spell_check(
    po_files,
    personal_dict=None,
    language="en_US",
    drop_capitalized=False,
    debug_only=False,
):
    """Spell check the given po files (po format, containing rst).

    personal_dict, when given, is passed to hunspell with its -p option.
    language is passed to hunspell with its -d option.
    With debug_only, the text that would be sent to hunspell is printed
    instead of being checked, and 0 is returned.

    Returns the number of errors found, or -1 when hunspell exits with an
    error status.
    """
    if debug_only:
        for po_file in po_files:
            print(po_to_text(str(po_file), drop_capitalized))
        return 0

    texts_for_hunspell = {
        po_file: po_to_text(str(po_file), drop_capitalized) for po_file in po_files
    }
    hunspell_command = ["hunspell", "-d", language, "-a"]
    if personal_dict:
        hunspell_command += ["-p", personal_dict]
    try:
        output = subprocess.run(
            hunspell_command,
            universal_newlines=True,
            input=quote_for_hunspell("\n".join(texts_for_hunspell.values())),
            stdout=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError:
        return -1
    return parse_hunspell_output(texts_for_hunspell, output)
def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int:
    """Parse `hunspell -a` output.

    Print one line per error on standard output, of the following format:

        FILE:LINE:ERROR

    hunspell_input contains a dict of files: all_lines_for_this_file.
    hunspell_output is the completed hunspell process; only its `stdout`
    text is read.

    Returns the number of errors (0 when hunspell_input is empty).

    Raises Unreachable if the output ends before all the input lines have
    been accounted for.
    """
    errors = 0
    checked_files = iter(hunspell_input.items())
    try:
        checked_file_name, checked_text = next(checked_files)
    except StopIteration:
        # No file was checked: there is nothing to walk through.
        return errors
    checked_lines = iter(checked_text.split("\n"))
    next(checked_lines)
    current_line_number = 1
    # The first line of the output is skipped; it is not a result line.
    for line in hunspell_output.stdout.split("\n")[1:]:
        if not line:
            # An empty output line means: advance to the next input line,
            # or to the next file once the current one is exhausted.
            try:
                next(checked_lines)
                current_line_number += 1
            except StopIteration:
                try:
                    checked_file_name, checked_text = next(checked_files)
                    checked_lines = iter(checked_text.split("\n"))
                    next(checked_lines)
                    current_line_number = 1
                except StopIteration:
                    return errors
            continue
        if line == "*":  # OK
            continue
        if line[0] == "&":
            # The second field of a "&" line is the original word.
            _, original, *_ = line.split()
            if look_like_a_word(original):
                print(checked_file_name, current_line_number, original, sep=":")
                errors += 1
    raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.")
def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed.

    Returns silently when a line of the `hunspell -D` output ends with a
    path component equal to `language`. Otherwise prints installation
    advice on stderr and exits with status 1.
    """
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    # The output is one str: split it into lines first, iterating over it
    # directly would yield single characters.
    languages = {Path(line.strip()).name for line in hunspell_dash_d.splitlines()}

    def error(*args, file=sys.stderr, **kwargs):
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error(" sudo apt install hunspell-{}".format(language))
    else:
        error(
            """I don't know your environment, but I bet the package name looks like:
hunspell-{language}
If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
""".format(
                language=language
            )
        )
    sys.exit(1)
def _modified_po_files(git_status):
    """Yield the po files listed in `git status --porcelain` output.

    Each line is read as "XY PATH" or, for renames and copies,
    "XY ORIG_PATH -> PATH": two status characters, one space, then the path.
    Entries flagged as deleted are skipped, for renames and copies the new
    path is used, and double quotes surrounding a path are removed.
    """
    for line in git_status.splitlines():
        status, filename = line[:2], line[3:]
        if not filename or "D" in status:
            continue
        if ("R" in status or "C" in status) and " -> " in filename:
            filename = filename.split(" -> ", 1)[1]
        if len(filename) >= 2 and filename[0] == filename[-1] == '"':
            filename = filename[1:-1]
        if filename.endswith(".po"):
            yield Path(filename)


def main():
    """Entry point (for command-line).

    Collects the files to check (positional arguments, --glob matches and,
    with --modified, the po files reported by git), spell checks them and
    exits with status 0 when no error was found, -1 otherwise.
    """
    args = parse_args()
    # Default level is CRITICAL (50); every -v lowers it by one level.
    logging.basicConfig(level=50 - 10 * args.verbose)
    default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = default_drop_capitalized
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain"], encoding="utf-8"
        )
        args.po_file.extend(_modified_po_files(git_status))
    try:
        errors = spell_check(
            args.po_file,
            args.personal_dict,
            args.language,
            drop_capitalized,
            args.debug,
        )
    except POSpellException as err:
        print(err, file=sys.stderr)
        sys.exit(-1)
    if errors == -1:
        # hunspell exited with an error status: the usual cause is a missing
        # dictionary, so try to give a helpful message.
        gracefull_handling_of_missing_dicts(args.language)
    sys.exit(0 if errors == 0 else -1)


if __name__ == "__main__":
    main()