pospell/pospell.py

"""pospell is a spellcheckers for po files containing reStructuedText.
"""
from collections import defaultdict
import io
import logging
import subprocess
import sys
import tempfile
from contextlib import redirect_stderr
from itertools import chain
from pathlib import Path
from shutil import which

import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import polib
from docutils.parsers.rst import roles
from docutils.utils import new_document

import regex

__version__ = "1.0.5"

DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}


try:
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
    ).split("\n")[0]
except FileNotFoundError:
    print("hunspell not found, please install hunspell.", file=sys.stderr)
    exit(1)


class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    pass


def monkey_patch_role(role):
    def role_or_generic(role_name, language_module, lineno, reporter):
        base_role, message = role(role_name, language_module, lineno, reporter)
        if base_role is None:
            roles.register_generic_role(role_name, DummyNodeClass)
            base_role, message = role(role_name, language_module, lineno, reporter)
        return base_role, message

    return role_or_generic


roles.role = monkey_patch_role(roles.role)


class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    def __init__(self, document):
        self.output = []
        self.depth = 0
        super().__init__(document)

    def dispatch_visit(self, node):
        self.depth += 1
        super().dispatch_visit(node)

    def dispatch_departure(self, node):
        self.depth -= 1
        super().dispatch_departure(node)

    def unknown_visit(self, node):
        """Mandatory implementation to visit unknwon nodes."""
        # print(" " * self.depth * 4, node.__class__.__name__, ":", node)

    def unknown_departure(self, node):
        """To help debugging tree."""
        # print(node, repr(node), node.__class__.__name__)

    def visit_emphasis(self, node):
        raise docutils.nodes.SkipChildren

    def visit_superscript(self, node):
        raise docutils.nodes.SkipChildren

    def visit_title_reference(self, node):
        raise docutils.nodes.SkipChildren

    def visit_strong(self, node):
        raise docutils.nodes.SkipChildren

    def visit_DummyNodeClass(self, node):
        raise docutils.nodes.SkipChildren

    def visit_reference(self, node):
        raise docutils.nodes.SkipChildren

    def visit_literal(self, node):
        raise docutils.nodes.SkipChildren

    def visit_Text(self, node):
        self.output.append(node.rawsource)

    def __str__(self):
        return " ".join(self.output)


def strip_rst(line):
    if line.endswith("::"):
        # Drop :: at the end, it would cause Literal block expected
        line = line[:-2]
    parser = docutils.parsers.rst.Parser()
    components = (docutils.parsers.rst.Parser,)
    settings = docutils.frontend.OptionParser(
        components=components
    ).get_default_values()
    stderr_stringio = io.StringIO()
    with redirect_stderr(stderr_stringio):
        document = new_document("<rst-doc>", settings=settings)
        parser.parse(line, document)
    stderr = stderr_stringio.getvalue()
    if stderr:
        print(stderr.strip(), "while parsing:", line)
    visitor = NodeToTextVisitor(document)
    document.walk(visitor)
    return str(visitor)


def clear(line, drop_capitalized=False, po_path=""):
    """Clear various other syntaxes we may encounter in a line."""
    # Normalize spaces
    line = regex.sub(r"\s+", " ", line).replace("\xad", "")

    to_drop = {
        r'<a href="[^"]*?">',
        # Strip accronyms
        r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
        r"---?",  # -- and --- separators to be ignored
        r" - ",  # Drop lone dashes (sometimes used in place of -- or ---)
        r"-\\ ",  # Ignore "MINUS BACKSLASH SPACE" typically used in
        # formulas, like '-\ *π*' but *π* gets removed too
        r"{[a-z_]*?}",  # Sphinx variable
        r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?",  # Versions
        r"[0-9]+h",  # Hours
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
        r"« . »",  # Single letter examples (typically in Unicode documentation)
    }
    if drop_capitalized:
        to_drop.add(
            # Strip capitalized words in sentences
            r"(?<!\. |^|-)\b(\p{Letter}['’])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
        )
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for pattern in to_drop:
            for dropped in regex.findall(pattern, line):
                logging.debug(
                    "%s: dropping %r via %r due to from %r",
                    po_path,
                    dropped,
                    pattern,
                    line,
                )
    return regex.sub("|".join(to_drop), r" ", line)


def po_to_text(po_path, drop_capitalized=False):
    """Converts a po file to a text file, by stripping the msgids and all
    po syntax, but by keeping the kept lines at their same position /
    line number.
    """
    buffer = []
    lines = 0
    entries = polib.pofile(po_path)
    for entry in entries:
        if entry.msgid == entry.msgstr:
            continue
        while lines < entry.linenum:
            buffer.append("")
            lines += 1
        buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
        lines += 1
    return "\n".join(buffer)


def parse_args():
    """Parse command line arguments."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "--drop-capitalized",
        action="store_true",
        help="Always drop capitalized words in sentences (defaults according to the language).",
    )
    parser.add_argument(
        "--no-drop-capitalized",
        action="store_true",
        help="Never drop capitalized words in sentences (defaults according to the language).",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    parser.add_argument(
        "--modified", "-m", action="store_true", help="Use git to find modified files."
    )
    args = parser.parse_args()
    if args.drop_capitalized and args.no_drop_capitalized:
        print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
        parser.print_help()
        exit(1)
    if not args.po_file and not args.modified:
        parser.print_help()
        exit(1)
    return args


def spell_check(
    po_files,
    personal_dict=None,
    language="en_EN",
    drop_capitalized=False,
    debug_only=False,
):
    """Check for spelling mistakes in the files po_files (po format,
    containing restructuredtext), for the given language.
    personal_dict allow to pass a personal dict (-p) option, to hunspell.

    Debug only will show what's passed to Hunspell instead of passing it.
    """
    errors = []
    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
    for po_file in po_files:
        if debug_only:
            print(po_to_text(str(po_file), drop_capitalized))
            continue
        text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
        try:
            output = subprocess.run(
                ["hunspell", "-d", language, "-l"] + personal_dict_arg,
                universal_newlines=True,
                input=text_for_hunspell,
                stdout=subprocess.PIPE,
            )
        except subprocess.CalledProcessError:
            return -1
        if not output.stdout:
            continue  # No errors :)
        line_of_words = defaultdict(set)
        for misspelled_word in {word for word in output.stdout.split("\n") if word}:
            for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1):
                if misspelled_word in line:
                    errors.append((po_file, line_number, misspelled_word))
    errors.sort()
    for error in errors:
        print(":".join(str(token) for token in error))
    return len(errors)


def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed."""
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    languages = {Path(line).name for line in hunspell_dash_d}

    def error(*args, file=sys.stderr, **kwargs):
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error("  sudo apt install hunspell-{}".format(language))
    else:
        error(
            """I don't know your environment, but I bet the package name looks like:

    hunspell-{language}

If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
""".format(
                language=language
            )
        )
    exit(1)


def main():
    """Module entry point."""
    args = parse_args()
    logging.basicConfig(level=50 - 10 * args.verbose)
    default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = default_drop_capitalized
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain"], encoding="utf-8"
        )
        git_status_lines = [
            line.split(maxsplit=2) for line in git_status.split("\n") if line
        ]
        args.po_file.extend(
            Path(filename)
            for status, filename in git_status_lines
            if filename.endswith(".po")
        )
    errors = spell_check(
        args.po_file, args.personal_dict, args.language, drop_capitalized, args.debug
    )
    if errors == -1:
        gracefull_handling_of_missing_dicts(args.language)
    exit(0 if errors == 0 else -1)


if __name__ == "__main__":
    main()