pospell/pospell.py

"""pospell is a spellcheckers for po files containing reStructuedText."""
import io
from string import digits
from unicodedata import category
import logging
import subprocess
import sys
from typing import Dict
from contextlib import redirect_stderr
from itertools import chain
from pathlib import Path
from shutil import which

import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import polib
from docutils.parsers.rst import roles
from docutils.utils import new_document

import regex

# Version of pospell, shown by the --version command line flag.
__version__ = "1.0.11"

# Per-language default for dropping capitalized words found inside sentences.
# main() falls back to False for any language not listed here.
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}


class POSpellException(Exception):
    """Base class of every exception raised by this module.

    po_to_text() wraps po file reading/parsing failures in it, and main()
    catches it to print the message and exit with an error status.
    """


class Unreachable(POSpellException):
    """The code encountered a state that should be unreachable."""


# Probe hunspell once, at import time: the first line of `hunspell --version`
# is kept for the --version flag, and a missing binary aborts immediately
# with a readable message rather than a traceback.
try:
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
    ).split("\n")[0]
except FileNotFoundError:
    print("hunspell not found, please install hunspell.", file=sys.stderr)
    sys.exit(1)


class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Used to represent any unknown roles, so we can parse any rst blindly.

    monkey_patch_role() registers it as the generic role for every role name
    docutils does not know; NodeToTextVisitor lists it in its IGNORE_LIST, so
    the content of such roles is not spell checked.
    """


def monkey_patch_role(role):
    """Wrap a docutils role lookup function so that it never comes back empty.

    `role` is the original lookup callable (docutils.parsers.rst.roles.role).
    The returned wrapper has the same signature: when the original lookup
    finds nothing for a role name, that name is registered as a generic role
    backed by DummyNodeClass and the lookup is done a second time.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        lookup_args = (role_name, language_module, lineno, reporter)
        role_fn, messages = role(*lookup_args)
        if role_fn is not None:
            return role_fn, messages
        # Unknown role: register a catch-all for this name, then look it up again.
        roles.register_generic_role(role_name, DummyNodeClass)
        role_fn, messages = role(*lookup_args)
        return role_fn, messages

    return role_or_generic


# Install the wrapper globally so docutils resolves unknown roles to DummyNodeClass.
roles.role = monkey_patch_role(roles.role)


class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Recursively convert a docutils node to a Python string.

    Usage:

    >>> visitor = NodeToTextVisitor(document)
    >>> document.walk(visitor)
    >>> print(str(visitor))

    It ignores (see IGNORE_LIST) some nodes, which we don't want in
    hunspell (emphasis typically contains proper names that are unknown
    to dictionaries).
    """

    # Node class names whose whole subtree is skipped (see __getattr__).
    # NOTE(review): visit_Text is defined on the class, so __getattr__ is never
    # consulted for it and the "Text" entry below looks redundant -- confirm.
    IGNORE_LIST = (
        "emphasis",
        "superscript",
        "title_reference",
        "strong",
        "DummyNodeClass",
        "reference",
        "literal",
        "Text",
    )

    def __init__(self, document):
        """Initialize visitor for the given node/document."""
        # Text fragments collected by visit_Text, in visiting order.
        self.output = []
        super().__init__(document)

    def unknown_visit(self, node):
        """Do nothing for node types that have no dedicated visit_* method."""

    @staticmethod
    def ignore(node):
        """Just raise SkipChildren.

        Used for all visit_* in the IGNORE_LIST.

        See __getattr__.
        """
        raise docutils.nodes.SkipChildren

    def __getattr__(self, name):
        """Return `ignore` for any visit_<name> whose <name> is in IGNORE_LIST.

        Only called when normal attribute lookup fails, so methods actually
        defined on the class take precedence. Any other missing attribute
        raises AttributeError as usual.
        """
        if name.startswith("visit_") and name[6:] in self.IGNORE_LIST:
            return self.ignore
        raise AttributeError(name)

    def visit_Text(self, node):
        """Keep this node text, this is typically what we want to spell check."""
        self.output.append(node.rawsource)

    def __str__(self):
        """Give the accumulated strings, joined with single spaces."""
        return " ".join(self.output)


def strip_rst(line):
    """Return the plain text of a reStructuredText snippet.

    The snippet is parsed with docutils and walked with NodeToTextVisitor,
    which keeps only the text that should be spell checked. Anything docutils
    writes to stderr while parsing is captured and printed together with the
    offending source.
    """
    # A trailing "::" announces a literal block; as nothing follows it here,
    # docutils would complain with "Literal block expected", so drop it.
    source = line[:-2] if line.endswith("::") else line

    rst_parser = docutils.parsers.rst.Parser()
    rst_settings = docutils.frontend.Values(
        {
            "report_level": 2,
            "halt_level": 4,
            "exit_status_level": 5,
            "debug": None,
            "warning_stream": None,
            "error_encoding": "utf-8",
            "error_encoding_error_handler": "backslashreplace",
            "language_code": "en",
            "id_prefix": "",
            "auto_id_prefix": "id",
            "pep_references": None,
            "pep_base_url": "http://www.python.org/dev/peps/",
            "pep_file_url_template": "pep-%04d",
            "rfc_references": None,
            "rfc_base_url": "http://tools.ietf.org/html/",
            "tab_width": 8,
            "trim_footnote_reference_space": None,
            "syntax_highlight": "long",
        }
    )

    # Capture docutils diagnostics instead of letting them reach stderr directly.
    captured_stderr = io.StringIO()
    with redirect_stderr(captured_stderr):
        document = new_document("<rst-doc>", settings=rst_settings)
        rst_parser.parse(source, document)
    diagnostics = captured_stderr.getvalue()
    if diagnostics:
        print(diagnostics.strip(), "while parsing:", source)

    text_collector = NodeToTextVisitor(document)
    document.walk(text_collector)
    return str(text_collector)


def clear(line, drop_capitalized=False, po_path=""):
    """Remove non-prose syntaxes from a line before it is spell checked.

    Whitespace runs are collapsed to a single space and soft hyphens are
    removed, then every match of the "droppable" patterns is replaced by a
    space. With drop_capitalized, capitalized words that do not start a
    sentence are dropped too. po_path is only used in debug log messages.
    """
    collapsed = regex.sub(r"\s+", " ", line)
    line = collapsed.replace("\xad", "")

    droppable = {
        r'<a href="[^"]*?">',  # Opening HTML link tag
        r"{[a-z_]*?}",  # Brace-style placeholder
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # printf-style named placeholder
        r"« . »",  # A single quoted character, as found in Unicode documentation
    }
    if drop_capitalized:
        # A capitalized word, optionally preceded by an elided letter and an
        # apostrophe, unless it follows ". ", the start of the line, or "-".
        droppable.add(
            r"(?<!\. |^|-)\b(\p{Letter}['’])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
        )

    debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
    if debug_enabled:
        # The extra findall pass only runs when its output will be shown.
        for pattern in droppable:
            for dropped in regex.findall(pattern, line):
                logging.debug(
                    "%s: dropping %r via %r due to from %r",
                    po_path,
                    dropped,
                    pattern,
                    line,
                )

    alternation = "|".join(droppable)
    # Replace by a space rather than nothing, so neighbouring words stay apart.
    return regex.sub(alternation, r" ", line)


def quote_for_hunspell(text):
    """Quote a paragraph so hunspell doesn't misinterpret it.

    Every non-empty line gets a leading "^"; empty lines are left empty.

    Quoting the manpage:
    It is recommended that programmatic interfaces prefix
    every data line with an uparrow to protect themselves
    against future changes in hunspell.
    """
    return "\n".join(f"^{line}" if line else line for line in text.split("\n"))


def po_to_text(po_path, drop_capitalized=False):
    """Convert a po file to a text file.

    This strips the msgids and all po syntax while keeping lines at
    their same position / line number.

    po_path is the path of the po file; drop_capitalized is forwarded to
    clear(). Returns the resulting text, lines joined by newlines.

    Raises POSpellException if the file cannot be read or parsed.
    """
    buffer = []
    lines = 0
    try:
        # Read explicitly as UTF-8: relying on the locale's default encoding
        # breaks on systems whose locale is not UTF-8.
        entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8"))
    except Exception as err:
        raise POSpellException(str(err)) from err
    for entry in entries:
        if entry.msgid == entry.msgstr:
            # Left identical to the source string: nothing to check here.
            continue
        # Pad with empty lines so this entry lands at a position derived from
        # its line number in the po file.
        while lines < entry.linenum:
            buffer.append("")
            lines += 1
        buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
        lines += 1
    return "\n".join(buffer)


def parse_args():
    """Parse command line arguments.

    Returns the argparse namespace. Prints the help and exits with status 1
    when both --drop-capitalized and --no-drop-capitalized are given, or when
    nothing selects files to check (no positional file, no --glob and no
    --modified).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "--drop-capitalized",
        action="store_true",
        help="Always drop capitalized words in sentences"
        " (defaults according to the language).",
    )
    parser.add_argument(
        "--no-drop-capitalized",
        action="store_true",
        help="Never drop capitalized words in sentences"
        " (defaults according to the language).",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    parser.add_argument(
        "--modified", "-m", action="store_true", help="Use git to find modified files."
    )
    args = parser.parse_args()
    if args.drop_capitalized and args.no_drop_capitalized:
        print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
        parser.print_help()
        sys.exit(1)
    # --glob alone is a valid way to select files (main() expands it later),
    # so only bail out when none of the three file sources was given.
    if not args.po_file and not args.modified and not args.glob:
        parser.print_help()
        sys.exit(1)
    return args


def look_like_a_word(word):
    """Tell whether the given str looks like a real word.

    Used to filter out non-words like `---` or `-0700` so they don't
    get reported. They typically are not errors.

    Empty strings, anything containing an ASCII digit or a hyphen, and
    anything with more than one uppercase letter are not considered words.
    """
    if not word:
        return False
    if not set(digits).isdisjoint(word):
        return False
    # Several capitals: probably an acronym, or a name like CPython, macOS, SQLite, ...
    uppercase_count = sum(1 for char in word if category(char) == "Lu")
    return uppercase_count <= 1 and "-" not in word


def spell_check(
    po_files,
    personal_dict=None,
    language="en_US",
    drop_capitalized=False,
    debug_only=False,
):
    """Check for spelling mistakes in the given po_files.

    (po format, containing restructuredtext), for the given language.
    personal_dict allow to pass a personal dict (-p) option, to hunspell.

    Debug only will show what's passed to Hunspell instead of passing it.

    Returns the number of errors found, 0 in debug mode or when there is no
    file to check, and -1 when hunspell exits with an error.
    """
    if debug_only:
        for po_file in po_files:
            print(po_to_text(str(po_file), drop_capitalized))
        return 0
    texts_for_hunspell = {
        po_file: po_to_text(str(po_file), drop_capitalized) for po_file in po_files
    }
    if not texts_for_hunspell:
        # Nothing to check (e.g. a glob matching no file): running hunspell
        # would be pointless, and there would be no file to map its output to.
        return 0
    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
    try:
        output = subprocess.run(
            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
            universal_newlines=True,
            input=quote_for_hunspell("\n".join(texts_for_hunspell.values())),
            stdout=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError:
        # The caller treats -1 as "hunspell failed" (possibly a missing dictionary).
        return -1
    return parse_hunspell_output(texts_for_hunspell, output)


def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int:
    """Parse `hunspell -a` output.

    Print one line per error on stdout, of the following format:

        FILE:LINE:ERROR

    Returns the number of errors.

    hunspell_input contains a dict of files: all_lines_for_this_file.
    hunspell_output is the finished hunspell process; only its `stdout`
    attribute (a str) is read.

    Raises Unreachable if the output ends before all the input lines have
    been accounted for.
    """
    errors = 0
    checked_files = iter(hunspell_input.items())
    try:
        checked_file_name, checked_text = next(checked_files)
    except StopIteration:
        # No file was given to hunspell, so there is nothing to report.
        return errors
    checked_lines = iter(checked_text.split("\n"))
    next(checked_lines)
    current_line_number = 1
    # The first output line is hunspell's version banner, skip it.
    for line in hunspell_output.stdout.split("\n")[1:]:
        if not line:
            # An empty output line closes the results for one input line:
            # move on to the next input line, or to the next file when the
            # current one is exhausted.
            try:
                next(checked_lines)
                current_line_number += 1
            except StopIteration:
                try:
                    checked_file_name, checked_text = next(checked_files)
                    checked_lines = iter(checked_text.split("\n"))
                    next(checked_lines)
                    current_line_number = 1
                except StopIteration:
                    return errors
            continue
        if line == "*":  # OK
            continue
        # "& word ..." is a misspelling with suggestions, "# word ..." is a
        # misspelling for which hunspell has no suggestion: both are errors.
        if line.startswith(("&", "#")):
            _, original, *_ = line.split()
            if look_like_a_word(original):
                print(checked_file_name, current_line_number, original, sep=":")
                errors += 1
    raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.")


def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed.

    Returns silently when `hunspell -D` lists a dictionary named like
    `language`; otherwise prints installation hints on stderr and exits
    with status 1.
    """
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    # One path per output line: iterate over the lines (iterating over the
    # str itself would yield single characters) and keep each base name.
    languages = {Path(line.strip()).name for line in hunspell_dash_d.splitlines()}

    def error(*args, file=sys.stderr, **kwargs):
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error("  sudo apt install hunspell-{}".format(language))
    else:
        error(
            """I don't know your environment, but I bet the package name looks like:

    hunspell-{language}

If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
""".format(
                language=language
            )
        )
    sys.exit(1)


def _modified_po_files(git_status):
    """Yield the Path of each existing .po file in `git status --porcelain -z` output."""
    entries = iter(git_status.split("\0"))
    for entry in entries:
        if len(entry) < 4:
            continue  # Trailing empty chunk after the last NUL.
        # Each entry is "XY PATH": two status letters, a space, then the path.
        status, filename = entry[:2], entry[3:]
        if "R" in status or "C" in status:
            # Renames and copies are followed by a second entry holding the
            # origin path, which is not a file to check.
            next(entries, None)
        if "D" in status:
            continue  # Deleted: there is no file left to check.
        if filename.endswith(".po"):
            yield Path(filename)


def main():
    """Entry point (for command-line).

    Exits with status 0 when no spelling error is found, and with a non-zero
    status otherwise (spelling errors, unreadable po file, hunspell failure).
    """
    args = parse_args()
    # Each -v lowers the logging threshold by one level, starting from CRITICAL.
    logging.basicConfig(level=50 - 10 * args.verbose)
    default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = default_drop_capitalized
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        # -z gives NUL separated, unquoted paths, so file names containing
        # spaces or special characters and renames are parsed reliably.
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain", "-z"], encoding="utf-8"
        )
        args.po_file.extend(_modified_po_files(git_status))
    try:
        errors = spell_check(
            args.po_file,
            args.personal_dict,
            args.language,
            drop_capitalized,
            args.debug,
        )
    except POSpellException as err:
        print(err, file=sys.stderr)
        sys.exit(-1)
    if errors == -1:
        # hunspell failed: explain how to install the dictionary if it is missing.
        gracefull_handling_of_missing_dicts(args.language)
    sys.exit(0 if errors == 0 else -1)


# Run the command line interface when this file is executed as a script.
if __name__ == "__main__":
    main()
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								"""pospell is a spellcheckers for po files containing reStructuedText."""
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								import io
-												Faster implementation (~twice faster on python-docs-fr).

											
										
										
											2020-10-11 21:00:30 +00:00
+								from string import digits
 								from unicodedata import category
-												Better handling of capitalized words..

											
										
										
											2018-07-31 22:20:03 +00:00
+								import logging
-												Initial commit

											
										
										
											2018-07-23 15:37:50 +00:00
+								import subprocess
-												Adding --version.

											
										
										
											2018-07-28 22:58:20 +00:00
+								import sys
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								from typing import Dict
-												Ignore versions like 1.6a1.

											
										
										
											2019-08-20 14:38:03 +00:00
+								from contextlib import redirect_stderr
-												Allow for multiple files to be passed positionally.

											
										
										
											2018-07-27 13:54:10 +00:00
+								from itertools import chain
-												Initial commit

											
										
										
											2018-07-23 15:37:50 +00:00
+								from pathlib import Path
-												Gracefull handling of missing dicts. (#11)


											
										
										
											2019-12-10 14:10:17 +00:00
+								from shutil import which
-												isort

											
										
										
											2018-07-27 09:38:17 +00:00
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								import docutils.frontend
 								import docutils.nodes
 								import docutils.parsers.rst
-												Adding --version.

											
										
										
											2018-07-28 22:58:20 +00:00
+								import polib
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								from docutils.parsers.rst import roles
 								from docutils.utils import new_document
-												Better handling of capitalized words..

											
										
										
											2018-07-31 22:20:03 +00:00
+								import regex
-												Bump version: 1.0.10 → 1.0.11

											
										
										
											2020-10-13 22:44:09 +00:00
+								__version__ = "1.0.11"
-												Add default values for capitalized words droppings, and add --no-drop-capitalized.

											
										
										
											2019-10-16 14:55:46 +00:00
 								DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
-												Use hunspell -l instead of hunspell -u3. Fixes #12 (#16)


											
										
										
											2020-07-01 15:35:13 +00:00
-												Handle file opening errors. Closes #18.

Co-authored-by: Christophe Nanteuil <christophe.nanteuil@gmail.com>

											
										
										
											2020-10-13 22:44:05 +00:00
+								class POSpellException(Exception):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    """All exceptions from this module inherit from this one."""
 								class Unreachable(POSpellException):
 								    """The code encontered a state that should be unreachable."""
-												Handle file opening errors. Closes #18.

Co-authored-by: Christophe Nanteuil <christophe.nanteuil@gmail.com>

											
										
										
											2020-10-13 22:44:05 +00:00
-												Adding --version.

											
										
										
											2018-07-28 22:58:20 +00:00
+								try:
 								    HUNSPELL_VERSION = subprocess.check_output(
 								        ["hunspell", "--version"], universal_newlines=True
 								    ).split("\n")[0]
 								except FileNotFoundError:
 								    print("hunspell not found, please install hunspell.", file=sys.stderr)
-												Using hunspell -a instead of hunspell -l to ensure we report the error at the right line.

											
										
										
											2020-10-12 16:09:26 +00:00
+								    sys.exit(1)
-												Adding --version.

											
										
										
											2018-07-28 22:58:20 +00:00
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
 								class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    """Used to represent any unknown roles, so we can parse any rst blindly."""
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
 								def monkey_patch_role(role):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    """Patch docutils.parsers.rst.roles.role so it always match.
 								    Giving a DummyNodeClass for unknown roles.
 								    """
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								    def role_or_generic(role_name, language_module, lineno, reporter):
 								        base_role, message = role(role_name, language_module, lineno, reporter)
 								        if base_role is None:
 								            roles.register_generic_role(role_name, DummyNodeClass)
 								            base_role, message = role(role_name, language_module, lineno, reporter)
 								        return base_role, message
 								    return role_or_generic
 								roles.role = monkey_patch_role(roles.role)
 								class NodeToTextVisitor(docutils.nodes.NodeVisitor):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    """Recursively convert a docutils node to a Python string.
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    Usage:
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    >>> visitor = NodeToTextVisitor(document)
 								    >>> document.walk(visitor)
 								    >>> print(str(visitor))
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    It ignores (see IGNORE_LIST) some nodes, which we don't want in
 								    hunspell (enphasis typically contain proper names that are unknown
 								    to dictionaires).
 								    """
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    IGNORE_LIST = (
 								        "emphasis",
 								        "superscript",
 								        "title_reference",
 								        "strong",
 								        "DummyNodeClass",
 								        "reference",
 								        "literal",
 								        "Text",
 								    )
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    def __init__(self, document):
 								        """Initialize visitor for the given node/document."""
 								        self.output = []
 								        super().__init__(document)
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    def unknown_visit(self, node):
 								        """Mandatory implementation to visit unknwon nodes."""
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    @staticmethod
 								    def ignore(node):
 								        """Just raise SkipChildren.
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								        Used for all visit_* in the IGNORE_LIST.
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								        See __getattr__.
 								        """
-												Compatibility with docutils 0.15

											
										
										
											2019-07-26 15:40:48 +00:00
+								        raise docutils.nodes.SkipChildren
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    def __getattr__(self, name):
 								        """Skip childrens from the IGNORE_LIST."""
 								        if name.startswith("visit_") and name[6:] in self.IGNORE_LIST:
 								            return self.ignore
 								        raise AttributeError(name)
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
 								    def visit_Text(self, node):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								        """Keep this node text, this is typically what we want to spell check."""
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								        self.output.append(node.rawsource)
 								    def __str__(self):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								        """Give the accumulated strings."""
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								        return " ".join(self.output)
-												Initial commit

											
										
										
											2018-07-23 15:37:50 +00:00
 								def strip_rst(line):
-												Tox and github actions. (#24)


											
										
										
											2020-11-23 13:26:34 +00:00
+								    """Transform reStructuredText to plain text."""
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								    if line.endswith("::"):
 								        # Drop :: at the end, it would cause Literal block expected
 								        line = line[:-2]
 								    parser = docutils.parsers.rst.Parser()
-												Faster implementation (~twice faster on python-docs-fr).

											
										
										
											2020-10-11 21:00:30 +00:00
+								    settings = docutils.frontend.Values(
 								        {
 								            "report_level": 2,
 								            "halt_level": 4,
 								            "exit_status_level": 5,
 								            "debug": None,
 								            "warning_stream": None,
 								            "error_encoding": "utf-8",
 								            "error_encoding_error_handler": "backslashreplace",
 								            "language_code": "en",
 								            "id_prefix": "",
 								            "auto_id_prefix": "id",
 								            "pep_references": None,
 								            "pep_base_url": "http://www.python.org/dev/peps/",
 								            "pep_file_url_template": "pep-%04d",
 								            "rfc_references": None,
 								            "rfc_base_url": "http://tools.ietf.org/html/",
 								            "tab_width": 8,
 								            "trim_footnote_reference_space": None,
-												Avoid regression on issue 21.

											
										
										
											2020-10-12 12:42:33 +00:00
+								            "syntax_highlight": "long",
-												Faster implementation (~twice faster on python-docs-fr).

											
										
										
											2020-10-11 21:00:30 +00:00
+								        }
 								    )
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								    stderr_stringio = io.StringIO()
 								    with redirect_stderr(stderr_stringio):
-												Ignore versions like 1.6a1.

											
										
										
											2019-08-20 14:38:03 +00:00
+								        document = new_document("<rst-doc>", settings=settings)
-												Use docutils to parse rst.

											
										
										
											2018-07-27 19:57:44 +00:00
+								        parser.parse(line, document)
 								    stderr = stderr_stringio.getvalue()
 								    if stderr:
 								        print(stderr.strip(), "while parsing:", line)
 								    visitor = NodeToTextVisitor(document)
 								    document.walk(visitor)
 								    return str(visitor)
-												Initial commit

											
										
										
											2018-07-23 15:37:50 +00:00
-												Use hunspell -l instead of hunspell -u3. Fixes #12 (#16)


											
										
										
											2020-07-01 15:35:13 +00:00
+								def clear(line, drop_capitalized=False, po_path=""):
-												Bump black.

											
										
										
											2020-10-11 13:33:09 +00:00
+								    """Clear various other syntaxes we may encounter in a line."""
-												FIX: Double spaces were breaking start-of-sentence detection.

											
										
										
											2019-09-16 08:44:18 +00:00
+								    # Normalize spaces
-												Use hunspell -l instead of hunspell -u3. Fixes #12 (#16)


											
										
										
											2020-07-01 15:35:13 +00:00
+								    line = regex.sub(r"\s+", " ", line).replace("\xad", "")
-												Better handling of capitalized words..

											
										
										
											2018-07-31 22:20:03 +00:00
+								    to_drop = {
 								        r'<a href="[^"]*?">',
-												allow underscors in variables

											
										
										
											2019-11-16 13:47:22 +00:00
+								        r"{[a-z_]*?}",  # Sphinx variable
-												allow full list of conversion types in printf-style variables

See https://docs.python.org/3/library/stdtypes.html#old-string-formatting

											
										
										
											2019-11-18 08:52:00 +00:00
+								        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
-												FIX: Double spaces were breaking start-of-sentence detection.

											
										
										
											2019-09-16 08:44:18 +00:00
+								        r"« . »",  # Single letter examples (typically in Unicode documentation)
-												Better handling of capitalized words..

											
										
										
											2018-07-31 22:20:03 +00:00
+								    }
-												make dropping capitalized words optional

											
										
										
											2019-10-09 11:06:45 +00:00
+								    if drop_capitalized:
-												Add default values for capitalized words droppings, and add --no-drop-capitalized.

											
										
										
											2019-10-16 14:55:46 +00:00
+								        to_drop.add(
-												make dropping capitalized words optional

											
										
										
											2019-10-09 11:06:45 +00:00
+								            # Strip capitalized words in sentences
-												Add default values for capitalized words droppings, and add --no-drop-capitalized.

											
										
										
											2019-10-16 14:55:46 +00:00
+								            r"(?<!\. |^|-)\b(\p{Letter}['’])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
 								        )
-												Better handling of capitalized words..

											
										
										
											2018-07-31 22:20:03 +00:00
+								    if logging.getLogger().isEnabledFor(logging.DEBUG):
 								        for pattern in to_drop:
 								            for dropped in regex.findall(pattern, line):
-												Avoid glueing words together. Fixes #15

											
										
										
											2020-06-28 09:13:45 +00:00
+								                logging.debug(
 								                    "%s: dropping %r via %r due to from %r",
 								                    po_path,
 								                    dropped,
 								                    pattern,
 								                    line,
 								                )
 								    return regex.sub("|".join(to_drop), r" ", line)
-												Strip sphinx variables and a href links.

											
										
										
											2018-07-27 09:01:09 +00:00
-												FIX: Sync error due to line seen as commented by hunspell.

											
										
										
											2020-10-13 22:22:26 +00:00
def quote_for_hunspell(text):
    """Protect every non-empty line of *text* from hunspell's interpretation.

    The hunspell manpage recommends that programmatic interfaces prefix
    every data line with an uparrow (``^``) to protect themselves against
    future changes in hunspell.

    Empty lines are left empty, so the number of lines is unchanged.
    """
    return "\n".join(
        "^" + paragraph if paragraph else "" for paragraph in text.split("\n")
    )
-												Add default values for capitalized words droppings, and add --no-drop-capitalized.

											
										
										
											2019-10-16 14:55:46 +00:00
def po_to_text(po_path, drop_capitalized=False):
    """Convert a po file to a text file.

    This strips the msgids and all po syntax while keeping lines at
    their same position / line number.

    po_path is the path of the po file to read, drop_capitalized is
    forwarded to clear() for every kept msgstr.

    Returns the resulting text as a single newline-joined string.

    Raises POSpellException if the file cannot be read or parsed.
    """
    try:
        # Decode explicitly as UTF-8: without an encoding, read_text() uses
        # the locale's preferred encoding, which mis-decodes (or fails on)
        # accented text wherever that default is not UTF-8.
        entries = polib.pofile(Path(po_path).read_text(encoding="utf-8"))
    except Exception as err:
        # Normalise any read/parse failure into this module's exception type.
        raise POSpellException(str(err)) from err
    buffer = []
    lines = 0
    for entry in entries:
        if entry.msgid == entry.msgstr:
            # msgstr identical to msgid: nothing translated, skip it.
            continue
        # Pad with empty lines up to the entry's line number, so hunspell
        # results can later be mapped back to a position in the po file.
        while lines < entry.linenum:
            buffer.append("")
            lines += 1
        buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
        lines += 1
    return "\n".join(buffer)
-												Initial commit

											
										
										
											2018-07-23 15:37:50 +00:00
-												Adding --version.

											
										
										
											2018-07-28 22:58:20 +00:00
def parse_args():
    """Parse command line arguments.

    Returns the argparse namespace.

    Exits with status 1, after printing the help, when both
    --drop-capitalized and --no-drop-capitalized are given, or when no
    source of files (positional po files, --glob, --modified) is given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "--drop-capitalized",
        action="store_true",
        help="Always drop capitalized words in sentences"
        " (defaults according to the language).",
    )
    parser.add_argument(
        "--no-drop-capitalized",
        action="store_true",
        help="Never drop capitalized words in sentences"
        " (defaults according to the language).",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="More output, use -vv, -vvv, and so on.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    parser.add_argument(
        "--modified", "-m", action="store_true", help="Use git to find modified files."
    )
    args = parser.parse_args()
    if args.drop_capitalized and args.no_drop_capitalized:
        # This is an error report: send it to stderr, not stdout.
        print(
            "Error: don't provide both --drop-capitalized AND --no-drop-capitalized.",
            file=sys.stderr,
        )
        parser.print_help(sys.stderr)
        sys.exit(1)
    # --glob is a valid way to provide files on its own, so it has to be
    # accepted here just like positional files and --modified.
    if not (args.po_file or args.modified or args.glob):
        parser.print_help()
        sys.exit(1)
    return args
-												Adding --version.

											
										
										
											2018-07-28 22:58:20 +00:00
-												Faster implementation (~twice faster on python-docs-fr).

											
										
										
											2020-10-11 21:00:30 +00:00
def look_like_a_word(word):
    """Tell whether the given str is worth reporting as a misspelled word.

    Non-words like `---` or `-0700` typically are not errors, so they
    are filtered out: anything empty, containing a digit, containing a
    dash, or having more than one uppercase letter is rejected.
    """
    if not word:
        return False
    contains_digit = not set(digits).isdisjoint(word)
    # Several uppercase letters: probably an acronym, or a name like
    # CPython, macOS, SQLite, ...
    uppercase_letters = sum(1 for char in word if category(char) == "Lu")
    return not (contains_digit or uppercase_letters > 1 or "-" in word)
-												Add default values for capitalized words droppings, and add --no-drop-capitalized.

											
										
										
											2019-10-16 14:55:46 +00:00
def spell_check(
    po_files,
    personal_dict=None,
    language="en_US",
    drop_capitalized=False,
    debug_only=False,
):
    """Check for spelling mistakes in the given po_files.

    (po format, containing restructuredtext), for the given language.

    personal_dict allow to pass a personal dict (-p) option, to hunspell.
    Debug only will show what's passed to Hunspell instead of passing it.

    Returns the number of errors found, 0 when there is nothing to check
    (or in debug mode), and -1 when hunspell exits with a failure status.
    """
    if debug_only:
        for po_file in po_files:
            print(po_to_text(str(po_file), drop_capitalized))
        return 0
    texts_for_hunspell = {
        po_file: po_to_text(str(po_file), drop_capitalized) for po_file in po_files
    }
    if not texts_for_hunspell:
        # Nothing to check (for example --modified with no modified po
        # file): there is no output to parse, so there are no errors.
        return 0
    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
    try:
        # All files go through a single hunspell process; the output is
        # split back per file by parse_hunspell_output.
        output = subprocess.run(
            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
            universal_newlines=True,
            input=quote_for_hunspell("\n".join(texts_for_hunspell.values())),
            stdout=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError:
        return -1
    return parse_hunspell_output(texts_for_hunspell, output)


def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int:
    """Parse `hunspell -a` output.

    Print one line per error on stdout, of the following format:

        FILE:LINE:ERROR

    Returns the number of errors.

    hunspell_input contains a dict of files: all_lines_for_this_file.
    hunspell_output is the completed hunspell process, only its `stdout`
    (a str) is read.
    """
    errors = 0
    checked_files = iter(hunspell_input.items())
    try:
        checked_file_name, checked_text = next(checked_files)
    except StopIteration:
        # No file was given to hunspell: nothing to report.
        return errors
    checked_lines = iter(checked_text.split("\n"))
    next(checked_lines)
    current_line_number = 1
    # The first output line is hunspell's banner, not a result.
    for line in hunspell_output.stdout.split("\n")[1:]:
        if not line:
            # An empty output line closes the results of one input line:
            # move on to the next input line, or to the next file once
            # the current one is exhausted.
            try:
                next(checked_lines)
                current_line_number += 1
            except StopIteration:
                try:
                    checked_file_name, checked_text = next(checked_files)
                    checked_lines = iter(checked_text.split("\n"))
                    next(checked_lines)
                    current_line_number = 1
                except StopIteration:
                    return errors
            continue
        if line == "*":  # OK
            continue
        # "& word ..." is a misspelling with suggestions, "# word offset" a
        # misspelling without any suggestion: both are spelling errors.
        if line[0] in ("&", "#"):
            _, original, *_ = line.split()
            if look_like_a_word(original):
                print(checked_file_name, current_line_number, original, sep=":")
                errors += 1
    raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.")
-												Implement '--modified' option, to check spell only on modified files according to git.

											
										
										
											2018-12-24 14:49:54 +00:00
-												Gracefull handling of missing dicts. (#11)


											
										
										
											2019-12-10 14:10:17 +00:00
def gracefull_handling_of_missing_dicts(language):
    """Check if hunspell dictionary for given language is installed.

    Returns None when `hunspell -D` lists a dictionary named like the
    given language. Otherwise prints installation advice on stderr and
    exits with status 1.
    """
    hunspell_dash_d = subprocess.check_output(
        ["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
    )
    # Work line by line: iterating over the str itself yields single
    # characters, so no dictionary name could ever match.
    languages = {Path(line.strip()).name for line in hunspell_dash_d.splitlines()}

    def error(*args, file=sys.stderr, **kwargs):
        print(*args, file=file, **kwargs)

    if language in languages:
        return
    error(
        "The hunspell dictionary for your language is missing, please install it.",
        end="\n\n",
    )
    if which("apt"):
        error("Maybe try something like:")
        error("  sudo apt install hunspell-{}".format(language))
    else:
        error(
            """I don't know your environment, but I bet the package name looks like:
    hunspell-{language}
If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
""".format(
                language=language
            )
        )
    sys.exit(1)
-												Gracefull handling of missing dicts. (#11)


											
										
										
											2019-12-10 14:10:17 +00:00
-												Implement '--modified' option, to check spell only on modified files according to git.

											
										
										
											2018-12-24 14:49:54 +00:00
def main():
    """Entry point (for command-line).

    Gathers the po files from the positional arguments, --glob and
    --modified, spell checks them, and exits with status 0 when no error
    was found, a non-zero status otherwise.
    """
    args = parse_args()
    # Each -v lowers the logging threshold by one level, starting at CRITICAL.
    logging.basicConfig(level=50 - 10 * args.verbose)
    default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
    if args.drop_capitalized:
        drop_capitalized = True
    elif args.no_drop_capitalized:
        drop_capitalized = False
    else:
        drop_capitalized = default_drop_capitalized
    args.po_file = list(
        chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
    )
    if args.modified:
        git_status = subprocess.check_output(
            ["git", "status", "--porcelain"], encoding="utf-8"
        )
        for line in git_status.split("\n"):
            if not line:
                continue
            # Porcelain entries are "XY PATH" or "XY OLD -> NEW": a two
            # characters status, a space, then the path(s).
            status, filename = line[:2], line[3:]
            if status == "D " or status[1] == "D":
                # Deleted files cannot be read, queueing them would abort
                # the whole run.
                continue
            if " -> " in filename:
                # Renamed or copied: only the new name exists.
                filename = filename.split(" -> ", 1)[1]
            if filename.endswith(".po"):
                args.po_file.append(Path(filename))
    try:
        errors = spell_check(
            args.po_file,
            args.personal_dict,
            args.language,
            drop_capitalized,
            args.debug,
        )
    except POSpellException as err:
        print(err, file=sys.stderr)
        sys.exit(-1)
    if errors == -1:
        # hunspell failed: a missing dictionary is the likely cause, try to
        # tell the user how to install it.
        gracefull_handling_of_missing_dicts(args.language)
    sys.exit(0 if errors == 0 else -1)
-												Initial commit

											
										
										
											2018-07-23 15:37:50 +00:00
-												Allow for multiple files to be passed positionally.

											
										
										
											2018-07-27 13:54:10 +00:00
# Executed as a script (not imported as a module): run the command line.
if __name__ == "__main__":
    main()