pospell/pospell.py

"""pospell is a spellcheckers for po files containing reStructuedText."""
import collections
import functools
import io
import logging
import multiprocessing
import os
import subprocess
import sys
from contextlib import redirect_stderr
from itertools import chain
from pathlib import Path
from shutil import which
from string import digits
from typing import List, Tuple
from unicodedata import category
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import polib
import regex
from docutils.parsers.rst import roles
from docutils.utils import new_document
from sphinxlint import rst
__version__ = "1.3"
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
Error = Tuple[str, int, str]
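
# A self-contained (filename, line, text) record; blocks of these can be
# split across worker processes without losing track of where each line
# came from.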
input_line = collections.namedtuple("input_line", "filename line text")
class POSpellException(Exception):
"""All exceptions from this module inherit from this one."""
class Unreachable(POSpellException):
"""The code encontered a state that should be unreachable."""
try:
HUNSPELL_VERSION = subprocess.check_output(
["hunspell", "--version"], universal_newlines=True
).split("\n", maxsplit=1)[0]
except FileNotFoundError:
print("hunspell not found, please install hunspell.", file=sys.stderr)
sys.exit(1)
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
"""Used to represent any unknown roles, so we can parse any rst blindly."""
def monkey_patch_role(role):
"""Patch docutils.parsers.rst.roles.role so it always match.
Giving a DummyNodeClass for unknown roles.
"""
def role_or_generic(role_name, language_module, lineno, reporter):
base_role, message = role(role_name, language_module, lineno, reporter)
if base_role is None:
roles.register_generic_role(role_name, DummyNodeClass)
base_role, message = role(role_name, language_module, lineno, reporter)
return base_role, message
return role_or_generic
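
# Patch docutils' role lookup at import time so that parsing never fails on
# roles pospell does not know about.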
roles.role = monkey_patch_role(roles.role)
class NodeToTextVisitor(docutils.nodes.NodeVisitor):
"""Recursively convert a docutils node to a Python string.

    Usage:

    >>> visitor = NodeToTextVisitor(document)
    >>> document.walk(visitor)
    >>> print(str(visitor))

    It ignores some nodes (see IGNORE_LIST) that we don't want to feed
    to hunspell (emphasis typically contains proper names that are
    unknown to dictionaries).
"""
IGNORE_LIST = (
"emphasis",
"superscript",
"title_reference",
"substitution_reference",
"citation_reference",
"strong",
"DummyNodeClass",
"reference",
"literal",
"Text",
"system_message",
)
def __init__(self, document):
"""Initialize visitor for the given node/document."""
self.output = []
super().__init__(document)
def unknown_visit(self, node):
"""Mandatory implementation to visit unknwon nodes."""
@staticmethod
def ignore(node):
"""Just raise SkipChildren.
Used for all visit_* in the IGNORE_LIST.
See __getattr__.
"""
raise docutils.nodes.SkipChildren
def __getattr__(self, name):
"""Skip childrens from the IGNORE_LIST."""
if name.startswith("visit_") and name[6:] in self.IGNORE_LIST:
return self.ignore
raise AttributeError(name)
def visit_Text(self, node):
"""Keep this node text, this is typically what we want to spell check."""
self.output.append(docutils.nodes.unescape(node, restore_backslashes=True))
def __str__(self):
"""Give the accumulated strings."""
return " ".join(self.output)
def strip_rst(line):
"""Transform reStructuredText to plain text."""
if line.endswith("::"):
        # Drop the trailing ::, it would cause a "Literal block expected" warning
line = line[:-2]
line = rst.NORMAL_ROLE_RE.sub("", line)
settings = docutils.frontend.get_default_settings()
settings.pep_references = None
settings.rfc_references = None
settings.pep_base_url = "http://www.python.org/dev/peps/"
settings.pep_file_url_template = "pep-%04d"
parser = docutils.parsers.rst.Parser()
stderr_stringio = io.StringIO()
with redirect_stderr(stderr_stringio):
document = new_document("<rst-doc>", settings=settings)
parser.parse(line, document)
stderr = stderr_stringio.getvalue()
if stderr:
print(stderr.strip(), "while parsing:", line)
visitor = NodeToTextVisitor(document)
document.walk(visitor)
return str(visitor)
def clear(line, drop_capitalized=False, po_path=""):
"""Clear various other syntaxes we may encounter in a line."""
# Normalize spaces
line = regex.sub(r"\s+", " ", line).replace("\xad", "")
to_drop = {
r'<a href="[^"]*?">',
r"{[a-z_]*?}", # Sphinx variable
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
r"« . »", # Single letter examples (typically in Unicode documentation)
}
if drop_capitalized:
to_drop.add(
# Strip capitalized words in sentences
r"(?<!\. |^|-)\b(\p{Letter}['])?\b\p{Uppercase}\p{Letter}[\w.-]*\b"
)
if logging.getLogger().isEnabledFor(logging.DEBUG):
for pattern in to_drop:
for dropped in regex.findall(pattern, line):
logging.debug(
"%s: dropping %r via %r due to from %r",
po_path,
dropped,
pattern,
line,
)
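    # All patterns are applied in one pass as a single alternation; every
    # match is replaced by a space.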
return regex.sub("|".join(to_drop), r" ", line)
def quote_for_hunspell(text):
"""Quote a paragraph so hunspell don't misinterpret it.
Quoting the manpage:
It is recommended that programmatic interfaces prefix
every data line with an uparrow to protect themselves
2020-11-23 13:26:34 +00:00
against future changes in hunspell.
"""
out = []
for line in text:
out.append("^" + line if line else "")
return "\n".join(out)
def po_to_text(po_path, drop_capitalized=False):
"""Convert a po file to a text file.
This strips the msgids and all po syntax while keeping lines at
their same position / line number.
"""
input_lines = []
lines = 0
try:
entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8"))
except Exception as err:
raise POSpellException(str(err)) from err
for entry in entries:
if entry.msgid == entry.msgstr:
continue
if entry.obsolete:
continue
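        # Pad with empty input_lines so reported line numbers keep matching
        # the original .po file.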
while lines < entry.linenum:
lines += 1
input_lines.append(input_line(po_path, lines, ""))
lines += 1
input_lines.append(
input_line(
po_path,
lines,
clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path),
)
)
return input_lines
def parse_args():
"""Parse command line arguments."""
import argparse
parser = argparse.ArgumentParser(
description="Check spelling in po files containing restructuredText."
)
parser.add_argument(
"-l",
"--language",
type=str,
default="fr",
help="Language to check, you'll have to install the corresponding "
"hunspell dictionary, on Debian see apt list 'hunspell-*' (defaults to 'fr').",
)
parser.add_argument(
"--glob",
type=str,
help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
"like --glob '**/*.po'.",
)
parser.add_argument(
"--drop-capitalized",
action="store_true",
help="Always drop capitalized words in sentences"
" (defaults according to the language).",
)
parser.add_argument(
"--no-drop-capitalized",
action="store_true",
help="Never drop capitalized words in sentences"
" (defaults according to the language).",
)
parser.add_argument(
"po_file",
nargs="*",
type=Path,
help="Files to check, can optionally be mixed with --glob, or not, "
"use the one that fit your needs.",
)
parser.add_argument(
"-v",
"--verbose",
action="count",
default=0,
help="More output, use -vv, -vvv, and so on.",
)
parser.add_argument(
"--version",
action="version",
version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
)
parser.add_argument("--debug", action="store_true")
parser.add_argument("-p", "--personal-dict", type=Path)
parser.add_argument(
"--modified", "-m", action="store_true", help="Use git to find modified files."
)
parser.add_argument(
"-j",
"--jobs",
type=int,
default=os.cpu_count(),
help="Number of files to check in paralel, defaults to all available CPUs",
)
args = parser.parse_args()
if args.personal_dict is not None and not args.personal_dict.exists():
print(f"Error: dictionary {str(args.personal_dict)!r} not found.")
sys.exit(1)
if args.drop_capitalized and args.no_drop_capitalized:
print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
parser.print_help()
sys.exit(1)
if not args.po_file and not args.modified and not args.glob:
parser.print_help()
sys.exit(1)
return args
def look_like_a_word(word):
"""Return True if the given str looks like a word.
Used to filter out non-words like `---` or `-0700` so they don't
get reported. They typically are not errors.
"""
if not word:
return False
if any(digit in word for digit in digits):
return False
if len([c for c in word if category(c) == "Lu"]) > 1:
        return False  # Probably an acronym, or a name like CPython, macOS, SQLite, ...
if "-" in word:
return False
return True
def run_hunspell(language, personal_dict, input_lines) -> List[Error]:
"""Run hunspell over the given input lines."""
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
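    # "-a" runs hunspell in its ispell-compatible pipe mode: it reads lines
    # from stdin and reports spelling results on stdout.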
try:
output = subprocess.check_output(
["hunspell", "-d", language, "-a"] + personal_dict_arg,
universal_newlines=True,
input=quote_for_hunspell(text for _, _, text in input_lines),
)
except subprocess.CalledProcessError:
return []
return parse_hunspell_output(input_lines, output.splitlines())
def flatten(list_of_lists):
"""[[a,b,c], [d,e,f]] -> [a,b,c,d,e,f]."""
return [element for a_list in list_of_lists for element in a_list]
def spell_check(
po_files,
personal_dict=None,
language="en_US",
drop_capitalized=False,
debug_only=False,
jobs=os.cpu_count(),
):
"""Check for spelling mistakes in the given po_files.
(po format, containing restructuredtext), for the given language.
personal_dict allow to pass a personal dict (-p) option, to hunspell.
Debug only will show what's passed to Hunspell instead of passing it.
2018-07-28 22:58:20 +00:00
"""
    # Pool.__exit__ calls terminate() instead of close(); we need the latter,
    # which ensures the processes' atexit handlers execute fully, which in
    # turn lets coverage write the sub-processes' coverage information.
pool = multiprocessing.Pool(jobs) # pylint: disable=consider-using-with
try:
input_lines = flatten(
pool.map(
functools.partial(po_to_text, drop_capitalized=drop_capitalized),
po_files,
)
)
if debug_only:
for filename, line, text in input_lines:
print(filename, line, text, sep=":")
return 0
if not input_lines:
return 0
# Distribute input lines across workers
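        # Ceiling division: each worker gets at most lines_per_job lines.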
lines_per_job = (len(input_lines) + jobs - 1) // jobs
chunked_inputs = [
input_lines[i : i + lines_per_job]
for i in range(0, len(input_lines), lines_per_job)
]
errors = flatten(
pool.map(
functools.partial(run_hunspell, language, personal_dict),
chunked_inputs,
)
)
finally:
pool.close()
pool.join()
for error in errors:
print(*error, sep=":")
return len(errors)
def parse_hunspell_output(inputs, outputs) -> List[Error]:
"""Parse `hunspell -a` output and collect all errors."""
# skip first line of hunspell output (it's the banner)
outputs = iter(outputs[1:])
errors = []
for po_input_line, output_line in zip(inputs, outputs):
if not po_input_line.text:
continue
while output_line:
if output_line.startswith("&"):
_, original, *_ = output_line.split()
if look_like_a_word(original):
errors.append(
(po_input_line.filename, po_input_line.line, original)
)
try:
output_line = next(outputs)
except StopIteration:
break
return errors


def gracefull_handling_of_missing_dicts(language):
"""Check if hunspell dictionary for given language is installed."""
hunspell_dash_d = subprocess.check_output(
["hunspell", "-D"], universal_newlines=True, stderr=subprocess.STDOUT
)
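    # `hunspell -D` typically prints its dictionary search path and the
    # available dictionaries (one full path per line, e.g.
    # /usr/share/hunspell/es_ES) on stderr, hence the stderr redirection
    # above; header lines only add harmless noise to the set built below.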
    languages = {Path(line).name for line in hunspell_dash_d.splitlines()}
def error(*args, file=sys.stderr, **kwargs):
print(*args, file=file, **kwargs)
if language in languages:
return
error(
"The hunspell dictionary for your language is missing, please install it.",
end="\n\n",
)
if which("apt"):
error("Maybe try something like:")
error(f" sudo apt install hunspell-{language}")
else:
error(
f"""I don't know your environment, but I bet the package name looks like:
hunspell-{language}
If you find it, please tell me (by opening an issue or a PR on
https://github.com/JulienPalard/pospell/) so I can enhance this error message.
"""
)
sys.exit(1)


def main():
"""Entry point (for command-line)."""
args = parse_args()
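    # Map the -v count to logging levels: no -v -> CRITICAL (50),
    # -v -> ERROR (40), -vv -> WARNING (30), -vvv -> INFO (20),
    # -vvvv -> DEBUG (10).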
logging.basicConfig(level=50 - 10 * args.verbose)
default_drop_capitalized = DEFAULT_DROP_CAPITALIZED.get(args.language, False)
if args.drop_capitalized:
drop_capitalized = True
elif args.no_drop_capitalized:
drop_capitalized = False
else:
drop_capitalized = default_drop_capitalized
args.po_file = list(
chain(Path(".").glob(args.glob) if args.glob else [], args.po_file)
)
if args.modified:
git_status = subprocess.check_output(
["git", "status", "--porcelain", "--no-renames"], encoding="utf-8"
)
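        # Each `--porcelain` line is "<XY> <path>"; keep every tracked or
        # untracked .po file that is not deleted ("D").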
git_status_lines = [
line.split(maxsplit=2) for line in git_status.split("\n") if line
]
args.po_file.extend(
Path(filename)
for status, filename in git_status_lines
if filename.endswith(".po") and status != "D"
)
try:
errors = spell_check(
args.po_file,
args.personal_dict,
args.language,
drop_capitalized,
args.debug,
args.jobs,
)
except POSpellException as err:
print(err, file=sys.stderr)
sys.exit(-1)
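    # spell_check() uses -1 as a sentinel for "hunspell could not check
    # anything"; the handler below assumes a missing dictionary and explains
    # how to install it.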
if errors == -1:
gracefull_handling_of_missing_dicts(args.language)
sys.exit(0 if errors == 0 else -1)


if __name__ == "__main__":
main()