pospell/pospell.py

356 lines
11 KiB
Python
Raw Normal View History

2018-07-27 13:49:58 +00:00
"""pospell is a spellchecker for po files containing reStructuredText.
"""
2018-07-27 19:57:44 +00:00
import io
2018-07-31 22:20:03 +00:00
import logging
2018-07-23 15:37:50 +00:00
import subprocess
2018-07-28 22:58:20 +00:00
import sys
2018-07-27 09:38:17 +00:00
import tempfile
2019-08-20 14:38:03 +00:00
from contextlib import redirect_stderr
from itertools import chain
2018-07-23 15:37:50 +00:00
from pathlib import Path
from shutil import which
2018-07-27 09:38:17 +00:00
2018-07-27 19:57:44 +00:00
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
2018-07-28 22:58:20 +00:00
import polib
2018-07-27 19:57:44 +00:00
from docutils.parsers.rst import roles
from docutils.utils import new_document
2018-07-31 22:20:03 +00:00
import regex
2019-10-17 07:46:30 +00:00
# Version of pospell, reported by the --version command line option.
__version__ = "1.0.3"
# Per-language default for dropping capitalized words found inside sentences;
# languages missing from this mapping default to False (see main()).
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
2018-07-28 22:58:20 +00:00
# Probe hunspell once at import time: this both records the version banner
# (first line of `hunspell --version`, shown by --version) and fails early
# with a readable message when hunspell is not installed.
try:
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
    ).split("\n")[0]
except FileNotFoundError:
    print("hunspell not found, please install hunspell.", file=sys.stderr)
    # sys.exit rather than the bare exit() helper: the latter is injected by
    # the site module for interactive use and is not guaranteed to exist.
    sys.exit(1)
2018-07-27 19:57:44 +00:00
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Placeholder inline node, registered as a generic role for unknown roles."""
def monkey_patch_role(role):
    """Wrap a role lookup function so that unknown roles get a generic role.

    role is called as role(role_name, language_module, lineno, reporter)
    and must return a (role function, messages) pair. When the role
    function is None, role_name is registered as a generic role backed by
    DummyNodeClass and the lookup is performed a second time.

    Returns the wrapping function, which has the same signature as role.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        lookup_args = (role_name, language_module, lineno, reporter)
        role_fn, messages = role(*lookup_args)
        if role_fn is not None:
            return role_fn, messages
        # Unknown role: register a placeholder for it, then look it up again.
        roles.register_generic_role(role_name, DummyNodeClass)
        role_fn, messages = role(*lookup_args)
        return role_fn, messages

    return role_or_generic
# Replace the docutils role lookup with the wrapper built by
# monkey_patch_role, so that roles unknown to docutils are registered as
# generic DummyNodeClass roles instead of resolving to None.
roles.role = monkey_patch_role(roles.role)
class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Node visitor gathering the raw source of the Text nodes it visits.

    Visiting an emphasis, superscript, title reference, strong,
    DummyNodeClass, reference or literal node raises SkipChildren, so the
    text they contain is not collected. str() on the visitor returns the
    collected pieces joined by single spaces.
    """

    def __init__(self, document):
        self.output = []
        self.depth = 0
        super().__init__(document)

    def dispatch_visit(self, node):
        """Count one more level of depth, then dispatch the visit as usual."""
        self.depth += 1
        super().dispatch_visit(node)

    def dispatch_departure(self, node):
        """Count one less level of depth, then dispatch the departure as usual."""
        self.depth -= 1
        super().dispatch_departure(node)

    def unknown_visit(self, node):
        """Mandatory implementation to visit unknown nodes: do nothing.

        Handy place to print the node (indented by self.depth) when
        debugging a tree.
        """

    def unknown_departure(self, node):
        """Do nothing when leaving an unknown node; handy hook for debugging."""

    def _skip_children(self, node):
        """Shared visit method for every node type whose content is ignored."""
        raise docutils.nodes.SkipChildren

    visit_emphasis = _skip_children
    visit_superscript = _skip_children
    visit_title_reference = _skip_children
    visit_strong = _skip_children
    visit_DummyNodeClass = _skip_children
    visit_reference = _skip_children
    visit_literal = _skip_children

    def visit_Text(self, node):
        """Collect the raw source of a text node."""
        self.output.append(node.rawsource)

    def __str__(self):
        return " ".join(self.output)
2018-07-23 15:37:50 +00:00
def strip_rst(line):
    """Parse line as reStructuredText and return the text kept by NodeToTextVisitor.

    Anything docutils writes to stderr while parsing is captured and
    printed, followed by the line that was being parsed.
    """
    if line.endswith("::"):
        # Drop :: at the end, it would cause Literal block expected
        line = line[:-2]
    rst_parser = docutils.parsers.rst.Parser()
    settings = docutils.frontend.OptionParser(
        components=(docutils.parsers.rst.Parser,)
    ).get_default_values()
    captured_stderr = io.StringIO()
    with redirect_stderr(captured_stderr):
        document = new_document("<rst-doc>", settings=settings)
        rst_parser.parse(line, document)
    parse_messages = captured_stderr.getvalue()
    if parse_messages:
        print(parse_messages.strip(), "while parsing:", line)
    text_visitor = NodeToTextVisitor(document)
    document.walk(text_visitor)
    return str(text_visitor)
2018-07-23 15:37:50 +00:00
def clear(po_path, line, drop_capitalized=False):
    """Clear various other syntaxes we may encounter in a line.

    Runs of whitespace are first collapsed to a single space, then every
    match of the patterns below is removed from the line.

    po_path is only used to prefix debug log messages.
    drop_capitalized additionally removes capitalized words found inside
    sentences.

    Returns the cleaned line.
    """
    # Normalize spaces
    line = regex.sub(r"\s+", " ", line)

    # A list rather than a set: the patterns are joined into a single
    # alternation below, and set iteration order varies between runs (string
    # hashing is randomized), which made the alternation order and the order
    # of the debug messages nondeterministic.
    to_drop = [
        r'<a href="[^"]*?">',
        # Strip acronyms
        r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
        r"---?",  # -- and --- separators to be ignored
        r" - ",  # Drop lone dashes (sometimes used in place of -- or ---)
        r"-\\ ",  # Ignore "MINUS BACKSLASH SPACE" typically used in
        # formulas, like '-\ *π*' but *π* gets removed too
        r"{[a-z_]*?}",  # Sphinx variable
        r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?",  # Versions
        r"[0-9]+h",  # Hours
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
        r"« . »",  # Single letter examples (typically in Unicode documentation)
        "\xad",  # soft hyphen
    ]
    if drop_capitalized:
        to_drop.append(
            # Strip capitalized words in sentences
            r"(?<!\. |^|-)\b(\p{Letter}['])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
        )
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for pattern in to_drop:
            # finditer + group(0): findall would report the content of the
            # capturing group, not the dropped text, for patterns having one.
            for match in regex.finditer(pattern, line):
                logging.debug(
                    "%s: dropping %r due to %r from %r",
                    po_path,
                    match.group(0),
                    pattern,
                    line,
                )
    return regex.sub("|".join(to_drop), r"", line)
def po_to_text(po_path, drop_capitalized=False):
    """Convert a po file to plain text ready to be spell checked.

    Only the msgstr of each entry is kept (entries whose msgstr equals
    their msgid are ignored), stripped from its reStructuredText and other
    syntaxes. The output is padded with empty lines so that each kept text
    lands on a line derived from the line number of its entry.
    """
    text_lines = []
    for entry in polib.pofile(po_path):
        if entry.msgid == entry.msgstr:
            continue
        # Pad with empty lines up to the entry position; multiplying by a
        # negative count yields an empty list, so nothing is ever removed.
        text_lines.extend([""] * (entry.linenum - len(text_lines)))
        text_lines.append(clear(po_path, strip_rst(entry.msgstr), drop_capitalized))
    return "\n".join(text_lines)
2018-07-23 15:37:50 +00:00
2018-07-28 22:58:20 +00:00
def parse_args():
    """Parse command line arguments.

    Returns the argparse namespace.

    Prints the usage and exits with status 1 when both --drop-capitalized
    and --no-drop-capitalized are given, or when no source of files is
    given at all (no positional po file, no --glob and no --modified).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "--drop-capitalized",
        action="store_true",
        help="Always drop capitalized words in sentences (defaults according to the language).",
    )
    parser.add_argument(
        "--no-drop-capitalized",
        action="store_true",
        help="Never drop capitalized words in sentences (defaults according to the language).",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    parser.add_argument(
        "--modified", "-m", action="store_true", help="Use git to find modified files."
    )
    args = parser.parse_args()
    if args.drop_capitalized and args.no_drop_capitalized:
        print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
        parser.print_help()
        sys.exit(1)
    # --glob alone is a valid way to select files (see the po_file help), so
    # only bail out when none of the three sources of files is given.
    if not args.po_file and not args.modified and not args.glob:
        parser.print_help()
        sys.exit(1)
    return args
2018-07-28 22:58:20 +00:00
def spell_check(
    po_files, personal_dict, language, drop_capitalized=False, debug_only=False
):
    """Spell check the given po files (containing reStructuredText) with hunspell.

    po_files is an iterable of paths, language is given to hunspell via -d,
    and personal_dict, when not empty, is given to hunspell via -p.
    With debug_only, the text that would be sent to hunspell is printed
    instead of being checked.

    Each error is printed as "file:line:word". Returns the number of
    errors found, or -1 as soon as hunspell exits with a failure status.
    """
    hunspell_report = regex.compile(
        r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$"
    )
    command_prefix = ["hunspell", "-d", language]
    if personal_dict:
        command_prefix += ["-p", personal_dict]
    command_prefix.append("-u3")
    error_count = 0
    with tempfile.TemporaryDirectory() as tmpdirname:
        for po_file in po_files:
            text = po_to_text(str(po_file), drop_capitalized)
            if debug_only:
                print(text)
                continue
            text_path = Path(tmpdirname) / po_file.name
            text_path.write_text(text)
            try:
                output = subprocess.check_output(
                    command_prefix + [str(text_path)], universal_newlines=True
                )
            except subprocess.CalledProcessError:
                return -1
            for report_line in output.split("\n"):
                found = hunspell_report.match(report_line)
                if found is None:
                    continue
                error_count += 1
                print(po_file, found.group("line"), found.group("error"), sep=":")
    return error_count
def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed.

    Returns None when one of the lines printed by `hunspell -D` is a path
    whose last component is language. Otherwise prints an installation
    hint on stderr and exits with status 1.
    """
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    # Iterate over the lines of the output: iterating over the string itself
    # yields single characters, so no language name could ever be found.
    languages = {Path(line.strip()).name for line in hunspell_dash_d.splitlines()}

    def error(*args, file=sys.stderr, **kwargs):
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error(" sudo apt install hunspell-{}".format(language))
    else:
        error(
            """I don't know your environment, but I bet the package name looks like:
hunspell-{language}
If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
""".format(
                language=language
            )
        )
    sys.exit(1)
def _modified_po_files(git_status):
    """Return the po files listed in a `git status --porcelain` output."""
    po_files = []
    for line in git_status.splitlines():
        # Each line is "XY PATH" or, for renames, "XY ORIG_PATH -> PATH",
        # where XY is a two character status that may contain spaces, hence
        # the slicing instead of a whitespace split.
        status, filename = line[:2], line[3:]
        if "D" in status:
            continue  # Deleted files cannot be spell checked.
        filename = filename.rpartition(" -> ")[2]
        # git wraps paths containing spaces in double quotes; paths needing
        # backslash escapes are left as is (they are not decoded here).
        if len(filename) >= 2 and filename[0] == filename[-1] == '"':
            if "\\" not in filename:
                filename = filename[1:-1]
        if filename.endswith(".po"):
            po_files.append(Path(filename))
    return po_files


def main():
    """Module entry point.

    Parses the command line, gathers the po files given as arguments, via
    --glob and via --modified, spell checks them, then exits with status 0
    when no error was found and -1 otherwise. When hunspell itself failed,
    the presence of the dictionary is checked first, to print a hint.
    """
    args = parse_args()
    # Each -v lowers the threshold by one logging level, starting at CRITICAL.
    logging.basicConfig(level=50 - 10 * args.verbose)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain"], encoding="utf-8"
        )
        args.po_file.extend(_modified_po_files(git_status))
    errors = spell_check(
        args.po_file, args.personal_dict, args.language, drop_capitalized, args.debug
    )
    if errors == -1:
        gracefull_handling_of_missing_dicts(args.language)
    sys.exit(0 if errors == 0 else -1)
2018-07-23 15:37:50 +00:00
# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()