Refactor pospell to use multiprocessing (#32)

One of the main drawbacks of pospell at the moment is that checking is
performed serially by a single hunspell process. In small projects this
is not noticeable, but in slightly bigger ones checking times add up
(e.g., in python-docs-es it takes ~2 minutes to check the whole set of
.po files).

The obvious solution to speed things up is to use multiprocessing,
parallelising the work at two different places: first, when reading the
input .po files and collecting the input strings to feed into hunspell,
and second, when running hunspell itself.

This commit implements this support. It works as follows:

 * A new namedtuple called input_line has been added. It contains a
   filename, a line number, and text, and thus uniquely identifies an
   input line in a self-contained way.
 * When collecting input to feed into hunspell, the po_to_text routine
   collects input_lines instead of a simple string. This is done with a
   multiprocessing Pool to run in parallel across all input files.
 * The input_lines are split into N blocks, with N being the size of
   the pool. Note that during this process input_lines from different
   files might end up in the same block, and input_lines from the same
   file might end up in different blocks; however, since input_lines
   are self-contained, no information is lost (see the sketch right
   after this list).
 * N hunspell instances are run over the N blocks of input_lines using
   the pool (only the text field from the input_lines is fed into
   hunspell).
 * When interpreting errors from hunspell we can match an input_line
   with its corresponding hunspell output lines, and thus can identify
   the original file:line that caused the error.
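
To make the flow concrete, here is a minimal, runnable sketch of the
two pool stages described above. fake_po_to_text and fake_check are
hypothetical stand-ins for the real po_to_text and run_hunspell (both
appear in the diff below); the chunking arithmetic is the same:

import collections
import multiprocessing

# A self-contained (filename, line, text) record, as in the diff below.
input_line = collections.namedtuple("input_line", "filename line text")


def fake_po_to_text(filename):
    # Hypothetical stand-in for po_to_text: pretend each file has 3 lines.
    return [input_line(filename, n, "text %d" % n) for n in range(3)]


def fake_check(block):
    # Hypothetical stand-in for run_hunspell: report what this worker saw.
    return [(line.filename, line.line) for line in block]


def flatten(list_of_lists):
    return [element for a_list in list_of_lists for element in a_list]


if __name__ == "__main__":
    jobs = 2
    pool = multiprocessing.Pool(jobs)
    try:
        # Stage 1: parse all .po files in parallel, merge into one flat list.
        lines = flatten(pool.map(fake_po_to_text, ["a.po", "b.po", "c.po"]))
        # Stage 2: split into N blocks of ceil(len/jobs) lines; lines from
        # different files may share a block, which is safe because every
        # input_line carries its own origin.
        per_job = (len(lines) + jobs - 1) // jobs
        blocks = [lines[i : i + per_job] for i in range(0, len(lines), per_job)]
        print(flatten(pool.map(fake_check, blocks)))
    finally:
        pool.close()
        pool.join()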

The multiprocessing pool is sized via a new -j/--jobs command line
option, which defaults to os.cpu_count() so that, out of the box, runs
use all available cores.

These are the kinds of differences I see with python-docs-es on my
machine, so YMMV depending on your setup/project:

$> time pospell -p dict2.txt -l es_ES */*.po -j 1
real    2m1.859s
user    2m6.680s
sys     0m3.829s

$> time pospell -p dict2.txt -l es_ES */*.po -j 2
real    1m10.322s
user    2m18.210s
sys     0m3.559s

Finally, these changes had some minor effects on the tooling around
testing. Pylint complained about spell_check now taking too many
arguments, so pylint's max-args setting has been adjusted as discussed.
Separately, coverage information now needs to be collected for
sub-processes of the main test process; the pytest-cov plug-in does
this automatically, so I've switched tox to use it rather than manually
running pytest under coverage (which would otherwise require extra
setup to account for the sub-processes).
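
For reference, the tox side of that switch typically boils down to
something like the following (a hypothetical sketch, assuming a plain
[testenv] running the suite; the tox commands change itself is not
shown in the diff excerpt below):

# Hypothetical sketch; previously something like
#     commands = coverage run -m pytest
# plus extra configuration to collect sub-process coverage.
[testenv]
deps =
    pytest
    pytest-cov
commands =
    pytest --cov=pospell {posargs}
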
rtobar 2021-11-26 17:26:35 +08:00 committed by GitHub
parent 8b0d6d8778
commit 3553ecd726
3 changed files with 103 additions and 61 deletions

.pylintrc (new file)

@@ -0,0 +1,2 @@
+[DESIGN]
+max-args = 6

pospell.py

@@ -2,10 +2,14 @@
 import io
 from string import digits
 from unicodedata import category
+import collections
+import functools
 import logging
+import multiprocessing
+import os
 import subprocess
 import sys
-from typing import Dict
+from typing import List, Tuple
 from contextlib import redirect_stderr
 from itertools import chain
 from pathlib import Path
@@ -25,6 +29,9 @@ __version__ = "1.0.13"

 DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}

+input_line = collections.namedtuple("input_line", "filename line text")
+
+
 class POSpellException(Exception):
     """All exceptions from this module inherit from this one."""
@@ -202,7 +209,7 @@ def quote_for_hunspell(text):
     against future changes in hunspell.
     """
     out = []
-    for line in text.split("\n"):
+    for line in text:
         out.append("^" + line if line else "")
     return "\n".join(out)
@@ -213,7 +220,7 @@ def po_to_text(po_path, drop_capitalized=False):
     This strips the msgids and all po syntax while keeping lines at
     their same position / line number.
     """
-    buffer = []
+    input_lines = []
     lines = 0
     try:
         entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8"))
@@ -223,11 +230,17 @@
         if entry.msgid == entry.msgstr:
             continue
         while lines < entry.linenum:
-            buffer.append("")
+            input_lines.append(input_line(po_path, lines, ""))
             lines += 1
-        buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
+        input_lines.append(
+            input_line(
+                po_path,
+                lines,
+                clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path),
+            )
+        )
         lines += 1
-    return "\n".join(buffer)
+    return input_lines


 def parse_args():
@@ -287,6 +300,13 @@ def parse_args():
     parser.add_argument(
         "--modified", "-m", action="store_true", help="Use git to find modified files."
     )
+    parser.add_argument(
+        "-j",
+        "--jobs",
+        type=int,
+        default=os.cpu_count(),
+        help="Number of files to check in parallel, defaults to all available CPUs",
+    )
     args = parser.parse_args()
     if args.drop_capitalized and args.no_drop_capitalized:
         print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
@@ -315,12 +335,32 @@ def look_like_a_word(word):
     return True


+def run_hunspell(language, personal_dict, input_lines):
+    """Run hunspell over the given input lines."""
+    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
+    try:
+        output = subprocess.check_output(
+            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
+            universal_newlines=True,
+            input=quote_for_hunspell(text for _, _, text in input_lines),
+        )
+    except subprocess.CalledProcessError:
+        return -1
+    return parse_hunspell_output(input_lines, output.splitlines())
+
+
+def flatten(list_of_lists):
+    """[[a,b,c], [d,e,f]] -> [a,b,c,d,e,f]."""
+    return [element for a_list in list_of_lists for element in a_list]
+
+
 def spell_check(
     po_files,
     personal_dict=None,
     language="en_US",
     drop_capitalized=False,
     debug_only=False,
+    jobs=os.cpu_count(),
 ):
     """Check for spelling mistakes in the given po_files.
@@ -329,67 +369,65 @@ def spell_check(

     Debug only will show what's passed to Hunspell instead of passing it.
     """
-    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
-    texts_for_hunspell = {}
-    for po_file in po_files:
-        if debug_only:
-            print(po_to_text(str(po_file), drop_capitalized))
-            continue
-        texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized)
-    if debug_only:
-        return 0
+    # Pool.__exit__ calls terminate() instead of close(), we need the latter,
+    # which ensures the processes' atexit handlers execute fully, which in
+    # turn lets coverage write the sub-processes' coverage information
+    pool = multiprocessing.Pool(jobs)  # pylint: disable=consider-using-with
     try:
-        output = subprocess.run(
-            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
-            universal_newlines=True,
-            input=quote_for_hunspell("\n".join(texts_for_hunspell.values())),
-            stdout=subprocess.PIPE,
-            check=True,
-        )
-    except subprocess.CalledProcessError:
-        return -1
-    return parse_hunspell_output(texts_for_hunspell, output)
+        input_lines = flatten(
+            pool.map(
+                functools.partial(po_to_text, drop_capitalized=drop_capitalized),
+                po_files,
+            )
+        )
+        if debug_only:
+            for filename, line, text in input_lines:
+                print(filename, line, text, sep=":")
+            return 0
+        if not input_lines:
+            return 0
+        # Distribute input lines across workers
+        lines_per_job = (len(input_lines) + jobs - 1) // jobs
+        chunked_inputs = [
+            input_lines[i : i + lines_per_job]
+            for i in range(0, len(input_lines), lines_per_job)
+        ]
+        errors = flatten(
+            pool.map(
+                functools.partial(run_hunspell, language, personal_dict),
+                chunked_inputs,
+            )
+        )
+    finally:
+        pool.close()
+        pool.join()
+    for error in errors:
+        print(*error, sep=":")
+    return len(errors)


-def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int:
-    """Parse `hunspell -a` output.
-
-    Print one line per error on stderr, of the following format:
-
-        FILE:LINE:ERROR
-
-    Returns the number of errors.
-
-    hunspell_input contains a dict of files: all_lines_for_this_file.
-    """
-    errors = 0
-    checked_files = iter(hunspell_input.items())
-    checked_file_name, checked_text = next(checked_files)
-    checked_lines = iter(checked_text.split("\n"))
-    next(checked_lines)
-    current_line_number = 1
-    for line in hunspell_output.stdout.split("\n")[1:]:
-        if not line:
-            try:
-                next(checked_lines)
-                current_line_number += 1
-            except StopIteration:
-                try:
-                    checked_file_name, checked_text = next(checked_files)
-                    checked_lines = iter(checked_text.split("\n"))
-                    next(checked_lines)
-                    current_line_number = 1
-                except StopIteration:
-                    return errors
-            continue
-        if line == "*":  # OK
-            continue
-        if line[0] == "&":
-            _, original, *_ = line.split()
-            if look_like_a_word(original):
-                print(checked_file_name, current_line_number, original, sep=":")
-                errors += 1
-    raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.")
+def parse_hunspell_output(inputs, outputs) -> List[Tuple[str, int, str]]:
+    """Parse `hunspell -a` output and collect all errors."""
+    # skip first line of hunspell output (it's the banner)
+    outputs = iter(outputs[1:])
+    errors = []
+    for po_input_line, output_line in zip(inputs, outputs):
+        if not po_input_line.text:
+            continue
+        while output_line:
+            if output_line.startswith("&"):
+                _, original, *_ = output_line.split()
+                if look_like_a_word(original):
+                    errors.append(
+                        (po_input_line.filename, po_input_line.line, original)
+                    )
+            try:
+                output_line = next(outputs)
+            except StopIteration:
+                break
+    return errors


 def gracefull_handling_of_missing_dicts(language):
@@ -457,6 +495,7 @@ def main():
             args.language,
             drop_capitalized,
             args.debug,
+            args.jobs,
         )
     except POSpellException as err:
         print(err, file=sys.stderr)


@@ -6,6 +6,7 @@ max-line-length = 88

 [coverage:run]
 ; branch = true: would need a lot of pragma: no branch on infinite loops.
 parallel = true
+concurrency = multiprocessing
 omit =
     .tox/*