From 3553ecd726196e563b67777ad37930b6d8d82ba7 Mon Sep 17 00:00:00 2001 From: rtobar Date: Fri, 26 Nov 2021 17:26:35 +0800 Subject: [PATCH] Refactor pospell to use multiprocessing (#32) One of the main drawbacks of pospell at the moment is that checking is performed serially by a single hunspell process. In small projects this is not noticeable, but in slightly bigger ones this can go up a bit (e.g., in python-docs-es it takes ~2 minutes to check the whole set of .po files). The obvious solution to speed things up is to use multiprocessing, parallelising the process at two different places: first, when reading the input .po files and collecting the input strings to feed into hunspell, and secondly when running hunspell itself. This commit implements this support. It works as follows: * A new namedtuple called input_line has been added. It contains a filename, a line, and text, and thus it uniquely identifies an input line in a self-contained way. * When collecting input to feed into hunspell, the po_to_text routine collects input_lines instead of a simple string. This is done with a multiprocessing Pool to run in parallel across all input files. * The input_lines are split in N blocks, with N being the size of the pool. Note that during this process input_lines from different files might end up in the same block, and input_lines from the same file might end up in different blocks; however since input_lines are self-contained we are not losing information. * N hunspell instances are run over the N blocks of input_lines using the pool (only the text field from the input_lines is fed into hunspell). * When interpreting errors from hunspell we can match an input_line with its corresponding hunspell output lines, and thus can identify the original file:line that caused the error. The multiprocessing pool is sized via a new -j/--jobs command line option, which defaults to os.cpu_count() to run at maximum speed by default. 
These are the kinds of differences I see with python-docs-es on my machine, so YMMV depending on your setup/project:
""" out = [] - for line in text.split("\n"): + for line in text: out.append("^" + line if line else "") return "\n".join(out) @@ -213,7 +220,7 @@ def po_to_text(po_path, drop_capitalized=False): This strips the msgids and all po syntax while keeping lines at their same position / line number. """ - buffer = [] + input_lines = [] lines = 0 try: entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8")) @@ -223,11 +230,17 @@ def po_to_text(po_path, drop_capitalized=False): if entry.msgid == entry.msgstr: continue while lines < entry.linenum: - buffer.append("") lines += 1 - buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path)) + input_lines.append(input_line(po_path, lines, "")) lines += 1 - return "\n".join(buffer) + input_lines.append( + input_line( + po_path, + lines, + clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path), + ) + ) + return input_lines def parse_args(): @@ -287,6 +300,13 @@ def parse_args(): parser.add_argument( "--modified", "-m", action="store_true", help="Use git to find modified files." 
) + parser.add_argument( + "-j", + "--jobs", + type=int, + default=os.cpu_count(), + help="Number of files to check in paralel, defaults to all available CPUs", + ) args = parser.parse_args() if args.drop_capitalized and args.no_drop_capitalized: print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.") @@ -315,12 +335,32 @@ def look_like_a_word(word): return True +def run_hunspell(language, personal_dict, input_lines): + """Run hunspell over the given input lines.""" + personal_dict_arg = ["-p", personal_dict] if personal_dict else [] + try: + output = subprocess.check_output( + ["hunspell", "-d", language, "-a"] + personal_dict_arg, + universal_newlines=True, + input=quote_for_hunspell(text for _, _, text in input_lines), + ) + except subprocess.CalledProcessError: + return -1 + return parse_hunspell_output(input_lines, output.splitlines()) + + +def flatten(list_of_lists): + """[[a,b,c], [d,e,f]] -> [a,b,c,d,e,f].""" + return [element for a_list in list_of_lists for element in a_list] + + def spell_check( po_files, personal_dict=None, language="en_US", drop_capitalized=False, debug_only=False, + jobs=os.cpu_count(), ): """Check for spelling mistakes in the given po_files. @@ -329,67 +369,65 @@ def spell_check( Debug only will show what's passed to Hunspell instead of passing it. 
""" - personal_dict_arg = ["-p", personal_dict] if personal_dict else [] - texts_for_hunspell = {} - for po_file in po_files: - if debug_only: - print(po_to_text(str(po_file), drop_capitalized)) - continue - texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized) - if debug_only: - return 0 + # Pool.__exit__ calls terminate() instead of close(), we need the latter, + # which ensures the processes' atexit handlers execute fully, which in + # turn lets coverage write the sub-processes' coverage information + pool = multiprocessing.Pool(jobs) # pylint: disable=consider-using-with try: - output = subprocess.run( - ["hunspell", "-d", language, "-a"] + personal_dict_arg, - universal_newlines=True, - input=quote_for_hunspell("\n".join(texts_for_hunspell.values())), - stdout=subprocess.PIPE, - check=True, + input_lines = flatten( + pool.map( + functools.partial(po_to_text, drop_capitalized=drop_capitalized), + po_files, + ) ) - except subprocess.CalledProcessError: - return -1 - return parse_hunspell_output(texts_for_hunspell, output) + if debug_only: + for filename, line, text in input_lines: + print(filename, line, text, sep=":") + return 0 + if not input_lines: + return 0 + + # Distribute input lines across workers + lines_per_job = (len(input_lines) + jobs - 1) // jobs + chunked_inputs = [ + input_lines[i : i + lines_per_job] + for i in range(0, len(input_lines), lines_per_job) + ] + errors = flatten( + pool.map( + functools.partial(run_hunspell, language, personal_dict), + chunked_inputs, + ) + ) + finally: + pool.close() + pool.join() + + for error in errors: + print(*error, sep=":") + return len(errors) -def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int: - """Parse `hunspell -a` output. - - Print one line per error on stderr, of the following format: - - FILE:LINE:ERROR - - Returns the number of errors. - - hunspell_input contains a dict of files: all_lines_for_this_file. 
- """ - errors = 0 - checked_files = iter(hunspell_input.items()) - checked_file_name, checked_text = next(checked_files) - checked_lines = iter(checked_text.split("\n")) - next(checked_lines) - current_line_number = 1 - for line in hunspell_output.stdout.split("\n")[1:]: - if not line: +def parse_hunspell_output(inputs, outputs) -> List[Tuple[str, int, str]]: + """Parse `hunspell -a` output and collect all errors.""" + # skip first line of hunspell output (it's the banner) + outputs = iter(outputs[1:]) + errors = [] + for po_input_line, output_line in zip(inputs, outputs): + if not po_input_line.text: + continue + while output_line: + if output_line.startswith("&"): + _, original, *_ = output_line.split() + if look_like_a_word(original): + errors.append( + (po_input_line.filename, po_input_line.line, original) + ) try: - next(checked_lines) - current_line_number += 1 + output_line = next(outputs) except StopIteration: - try: - checked_file_name, checked_text = next(checked_files) - checked_lines = iter(checked_text.split("\n")) - next(checked_lines) - current_line_number = 1 - except StopIteration: - return errors - continue - if line == "*": # OK - continue - if line[0] == "&": - _, original, *_ = line.split() - if look_like_a_word(original): - print(checked_file_name, current_line_number, original, sep=":") - errors += 1 - raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.") + break + return errors def gracefull_handling_of_missing_dicts(language): @@ -457,6 +495,7 @@ def main(): args.language, drop_capitalized, args.debug, + args.jobs, ) except POSpellException as err: print(err, file=sys.stderr) diff --git a/tox.ini b/tox.ini index a34f368..5673141 100644 --- a/tox.ini +++ b/tox.ini @@ -6,6 +6,7 @@ max-line-length = 88 [coverage:run] ; branch = true: would need a lot of pragma: no branch on infinite loops. parallel = true +concurrency = multiprocessing omit = .tox/*