pospell/pospell.py

360 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""pospell is a spellcheckers for po files containing reStructuedText.
"""
from collections import defaultdict
import io
import logging
import subprocess
import sys
import tempfile
from contextlib import redirect_stderr
from itertools import chain
from pathlib import Path
from shutil import which
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import polib
from docutils.parsers.rst import roles
from docutils.utils import new_document
import regex
__version__ = "1.0.5"

# Languages for which capitalized words inside sentences are dropped by
# default; main() falls back to False for any language not listed here.
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}

try:
    # First line of `hunspell --version`; reported by the --version option.
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
    ).split("\n")[0]
except FileNotFoundError:
    print("hunspell not found, please install hunspell.", file=sys.stderr)
    # sys.exit() rather than the site-provided exit(): the latter is meant for
    # the interactive shell and is not guaranteed to exist (e.g. python -S).
    sys.exit(1)
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Placeholder inline node registered for roles docutils does not know."""
def monkey_patch_role(role):
    """Wrap a role lookup function so that unknown roles become generic ones.

    ``role`` is called with ``(role_name, language_module, lineno, reporter)``
    and is expected to return a ``(role, messages)`` pair. When the returned
    role is ``None``, ``role_name`` is registered as a generic role backed by
    ``DummyNodeClass`` and the lookup is performed a second time.

    Returns the wrapping function, which has the same signature as ``role``.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        lookup_args = (role_name, language_module, lineno, reporter)
        found_role, messages = role(*lookup_args)
        if found_role is not None:
            return found_role, messages
        # Unknown role: register a placeholder for it, then look it up again.
        roles.register_generic_role(role_name, DummyNodeClass)
        found_role, messages = role(*lookup_args)
        return found_role, messages

    return role_or_generic
# Replace the docutils role lookup at import time so that role names it does
# not know are registered as generic DummyNodeClass roles on first use.
roles.role = monkey_patch_role(roles.role)
class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Node visitor gathering the raw source of the Text nodes it reaches.

    Children of emphasis, superscript, title reference, strong, reference,
    literal and DummyNodeClass nodes are skipped, so their text never reaches
    ``output``. ``str(visitor)`` joins the collected pieces with spaces.
    """

    def __init__(self, document):
        self.output = []
        self.depth = 0
        super().__init__(document)

    def dispatch_visit(self, node):
        self.depth += 1
        super().dispatch_visit(node)

    def dispatch_departure(self, node):
        self.depth -= 1
        super().dispatch_departure(node)

    def unknown_visit(self, node):
        """Mandatory implementation to visit unknown nodes."""
        # Debugging aid, kept disabled:
        # print(" " * self.depth * 4, node.__class__.__name__, ":", node)

    def unknown_departure(self, node):
        """To help debugging tree."""
        # Debugging aid, kept disabled:
        # print(node, repr(node), node.__class__.__name__)

    def _skip_children(self, node):
        """Shared handler for every node type whose content is ignored."""
        raise docutils.nodes.SkipChildren

    visit_emphasis = _skip_children
    visit_superscript = _skip_children
    visit_title_reference = _skip_children
    visit_strong = _skip_children
    visit_DummyNodeClass = _skip_children
    visit_reference = _skip_children
    visit_literal = _skip_children

    def visit_Text(self, node):
        self.output.append(node.rawsource)

    def __str__(self):
        return " ".join(self.output)
def strip_rst(line):
    """Parse ``line`` as reStructuredText and return the text kept by
    NodeToTextVisitor.

    Anything docutils writes to stderr while parsing is captured and printed
    together with the text being parsed.
    """
    # A trailing "::" would make docutils report "Literal block expected".
    text = line[:-2] if line.endswith("::") else line
    rst_parser = docutils.parsers.rst.Parser()
    option_parser = docutils.frontend.OptionParser(
        components=(docutils.parsers.rst.Parser,)
    )
    settings = option_parser.get_default_values()
    captured = io.StringIO()
    with redirect_stderr(captured):
        document = new_document("<rst-doc>", settings=settings)
        rst_parser.parse(text, document)
    parser_messages = captured.getvalue()
    if parser_messages:
        print(parser_messages.strip(), "while parsing:", text)
    visitor = NodeToTextVisitor(document)
    document.walk(visitor)
    return str(visitor)
def clear(line, drop_capitalized=False, po_path=""):
    """Clear various other syntaxes we may encounter in a line.

    Whitespace runs are collapsed to a single space, soft hyphens are removed,
    then every match of the patterns below is replaced by a space.

    line: the text to clean.
    drop_capitalized: when true, also drop capitalized words found inside
        sentences.
    po_path: only used to prefix the debug log messages.

    Returns the cleaned text.
    """
    # Normalize spaces
    line = regex.sub(r"\s+", " ", line).replace("\xad", "")
    # A list, not a set: the patterns are joined into a single alternation, and
    # alternation order decides which pattern wins at a given position. Set
    # iteration order changes between runs (string hash randomization), which
    # could make the result differ from one run to the next.
    to_drop = [
        r'<a href="[^"]*?">',
        # Strip acronyms
        r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
        r"---?",  # -- and --- separators to be ignored
        r" - ",  # Drop lone dashes (sometimes used in place of -- or ---)
        r"-\\ ",  # Ignore "MINUS BACKSLASH SPACE" typically used in
        # formulas, like '-\ *π*' but *π* gets removed too
        r"{[a-z_]*?}",  # Sphinx variable
        r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?",  # Versions
        r"[0-9]+h",  # Hours
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
        r"« . »",  # Single letter examples (typically in Unicode documentation)
    ]
    if drop_capitalized:
        to_drop.append(
            # Strip capitalized words in sentences
            r"(?<!\. |^|-)\b(\p{Letter}['])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
        )
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for pattern in to_drop:
            # finditer + group(0) reports the whole dropped text; findall would
            # return only the capture group for patterns that contain one.
            for match in regex.finditer(pattern, line):
                logging.debug(
                    "%s: dropping %r via %r due to from %r",
                    po_path,
                    match.group(0),
                    pattern,
                    line,
                )
    return regex.sub("|".join(to_drop), r" ", line)
def po_to_text(po_path, drop_capitalized=False):
    """Turn a po file into plain text made only of its cleaned msgstr values.

    Entries whose msgstr equals their msgid are skipped. Empty lines are
    inserted before each kept entry so that its text lands at a position
    derived from the entry's line number in the po file.
    """
    text_lines = []
    for entry in polib.pofile(po_path):
        if entry.msgstr == entry.msgid:
            continue
        # Pad with blank lines up to the entry's line number.
        missing = entry.linenum - len(text_lines)
        if missing > 0:
            text_lines.extend([""] * missing)
        cleaned = clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path)
        text_lines.append(cleaned)
    return "\n".join(text_lines)
def parse_args():
    """Parse command line arguments.

    Returns the argparse namespace. Exits with status 1 (after printing the
    help) when both --drop-capitalized and --no-drop-capitalized are given, or
    when no source of files (positional files, --glob or --modified) is given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "--drop-capitalized",
        action="store_true",
        help="Always drop capitalized words in sentences (defaults according to the language).",
    )
    parser.add_argument(
        "--no-drop-capitalized",
        action="store_true",
        help="Never drop capitalized words in sentences (defaults according to the language).",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    parser.add_argument(
        "--modified", "-m", action="store_true", help="Use git to find modified files."
    )
    args = parser.parse_args()
    if args.drop_capitalized and args.no_drop_capitalized:
        print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
        parser.print_help()
        sys.exit(1)
    # --glob alone is a valid way to select files (main() expands it), so it
    # must count as a file source just like positional files and --modified.
    if not args.po_file and not args.modified and not args.glob:
        parser.print_help()
        sys.exit(1)
    return args
def spell_check(
    po_files,
    personal_dict=None,
    language="en_EN",
    drop_capitalized=False,
    debug_only=False,
):
    """Check for spelling mistakes in the files po_files (po format,
    containing restructuredtext), for the given language.

    personal_dict allow to pass a personal dict (-p) option, to hunspell.
    Debug only will show what's passed to Hunspell instead of passing it.

    Each error is printed as ``file:line:word``.

    Returns the number of errors found, or -1 as soon as a hunspell run exits
    with a non-zero status.
    """
    errors = []
    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
    for po_file in po_files:
        text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
        if debug_only:
            print(text_for_hunspell)
            continue
        try:
            output = subprocess.run(
                ["hunspell", "-d", language, "-l"] + personal_dict_arg,
                universal_newlines=True,
                input=text_for_hunspell,
                stdout=subprocess.PIPE,
                # Without check=True, run() never raises CalledProcessError and
                # the handler below would be dead code.
                check=True,
            )
        except subprocess.CalledProcessError:
            return -1
        if not output.stdout:
            continue  # No errors :)
        misspelled_words = {word for word in output.stdout.split("\n") if word}
        # Split once per file rather than once per misspelled word.
        text_lines = text_for_hunspell.split("\n")
        for misspelled_word in misspelled_words:
            for line_number, line in enumerate(text_lines, start=1):
                # NOTE(review): plain substring test, so a misspelled word is
                # also reported on lines where it is only part of a longer word.
                if misspelled_word in line:
                    errors.append((po_file, line_number, misspelled_word))
    errors.sort()
    for error in errors:
        print(":".join(str(token) for token in error))
    return len(errors)
def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed.

    Returns silently when ``language`` is the last path component of one of the
    lines printed by ``hunspell -D``. Otherwise prints installation advice on
    stderr and exits with status 1.
    """
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    # Iterate over lines: iterating over the string itself would yield single
    # characters, so no language name could ever be found.
    languages = {Path(line.strip()).name for line in hunspell_dash_d.splitlines()}

    def error(*args, file=sys.stderr, **kwargs):
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error(" sudo apt install hunspell-{}".format(language))
    else:
        error(
            """I don't know your environment, but I bet the package name looks like:
hunspell-{language}
If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
""".format(
                language=language
            )
        )
    # sys.exit() rather than the site-provided exit(), which is meant for the
    # interactive shell and may be missing.
    sys.exit(1)
def main():
    """Module entry point.

    Parses the command line, builds the list of po files (positional files,
    --glob matches and, with --modified, the .po files listed by
    ``git status --porcelain``), runs the spell check and exits with status 0
    when no error was found, -1 otherwise.
    """
    args = parse_args()
    # -v lowers the threshold one level at a time, starting from CRITICAL (50).
    logging.basicConfig(level=50 - 10 * args.verbose)
    default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = default_drop_capitalized
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain"], encoding="utf-8"
        )
        for status_line in git_status.split("\n"):
            # Each line is "XY PATH", or "XY ORIG_PATH -> PATH" for renames.
            # Split once only: the previous maxsplit=2 could yield three
            # fields, which broke the (status, filename) unpacking.
            fields = status_line.split(maxsplit=1)
            if len(fields) != 2:
                continue
            # For a rename, keep the new path.
            filename = fields[1].rpartition(" -> ")[2]
            if filename.endswith(".po"):
                args.po_file.append(Path(filename))
    errors = spell_check(
        args.po_file, args.personal_dict, args.language, drop_capitalized, args.debug
    )
    if errors == -1:
        # hunspell failed: explain (and exit) if the dictionary is missing.
        gracefull_handling_of_missing_dicts(args.language)
    # sys.exit() rather than the site-provided exit(), which is meant for the
    # interactive shell and may be missing.
    sys.exit(0 if errors == 0 else -1)
# Run the command line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()