From 3553ecd726196e563b67777ad37930b6d8d82ba7 Mon Sep 17 00:00:00 2001 From: rtobar Date: Fri, 26 Nov 2021 17:26:35 +0800 Subject: [PATCH] Refactor pospell to use multiprocessing (#32) One of the main drawbacks of pospell at the moment is that checking is performed serially by a single hunspell process. In small projects this is not noticeable, but in slightly bigger ones this can go up a bit (e.g., in python-docs-es it takes ~2 minutes to check the whole set of .po files). The obvious solution to speed things up is to use multiprocessing, parallelising the process at two different places: first, when reading the input .po files and collecting the input strings to feed into hunspell, and secondly when running hunspell itself. This commit implements this support. It works as follows: * A new namedtuple called input_line has been added. It contains a filename, a line, and text, and thus it uniquely identifies an input line in a self-contained way. * When collecting input to feed into hunspell, the po_to_text routine collects input_lines instead of a simple string. This is done with a multiprocessing Pool to run in parallel across all input files. * The input_lines are split in N blocks, with N being the size of the pool. Note that during this process input_lines from different files might end up in the same block, and input_lines from the same file might end up in different blocks; however since input_lines are self-contained we are not losing information. * N hunspell instances are run over the N blocks of input_lines using the pool (only the text field from the input_lines is fed into hunspell). * When interpreting errors from hunspell we can match an input_line with its corresponding hunspell output lines, and thus can identify the original file:line that caused the error. The multiprocessing pool is sized via a new -j/--jobs command line option, which defaults to os.cpu_count() to run at maximum speed by default. 
These are the kinds of differences I see with python-docs-es on my machine, so YMMV depending on your setup/project:
""" out = [] - for line in text.split("\n"): + for line in text: out.append("^" + line if line else "") return "\n".join(out) @@ -213,7 +220,7 @@ def po_to_text(po_path, drop_capitalized=False): This strips the msgids and all po syntax while keeping lines at their same position / line number. """ - buffer = [] + input_lines = [] lines = 0 try: entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8")) @@ -223,11 +230,17 @@ def po_to_text(po_path, drop_capitalized=False): if entry.msgid == entry.msgstr: continue while lines < entry.linenum: - buffer.append("") lines += 1 - buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path)) + input_lines.append(input_line(po_path, lines, "")) lines += 1 - return "\n".join(buffer) + input_lines.append( + input_line( + po_path, + lines, + clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path), + ) + ) + return input_lines def parse_args(): @@ -287,6 +300,13 @@ def parse_args(): parser.add_argument( "--modified", "-m", action="store_true", help="Use git to find modified files." 
) + parser.add_argument( + "-j", + "--jobs", + type=int, + default=os.cpu_count(), + help="Number of files to check in paralel, defaults to all available CPUs", + ) args = parser.parse_args() if args.drop_capitalized and args.no_drop_capitalized: print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.") @@ -315,12 +335,32 @@ def look_like_a_word(word): return True +def run_hunspell(language, personal_dict, input_lines): + """Run hunspell over the given input lines.""" + personal_dict_arg = ["-p", personal_dict] if personal_dict else [] + try: + output = subprocess.check_output( + ["hunspell", "-d", language, "-a"] + personal_dict_arg, + universal_newlines=True, + input=quote_for_hunspell(text for _, _, text in input_lines), + ) + except subprocess.CalledProcessError: + return -1 + return parse_hunspell_output(input_lines, output.splitlines()) + + +def flatten(list_of_lists): + """[[a,b,c], [d,e,f]] -> [a,b,c,d,e,f].""" + return [element for a_list in list_of_lists for element in a_list] + + def spell_check( po_files, personal_dict=None, language="en_US", drop_capitalized=False, debug_only=False, + jobs=os.cpu_count(), ): """Check for spelling mistakes in the given po_files. @@ -329,67 +369,65 @@ def spell_check( Debug only will show what's passed to Hunspell instead of passing it. 
""" - personal_dict_arg = ["-p", personal_dict] if personal_dict else [] - texts_for_hunspell = {} - for po_file in po_files: - if debug_only: - print(po_to_text(str(po_file), drop_capitalized)) - continue - texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized) - if debug_only: - return 0 + # Pool.__exit__ calls terminate() instead of close(), we need the latter, + # which ensures the processes' atexit handlers execute fully, which in + # turn lets coverage write the sub-processes' coverage information + pool = multiprocessing.Pool(jobs) # pylint: disable=consider-using-with try: - output = subprocess.run( - ["hunspell", "-d", language, "-a"] + personal_dict_arg, - universal_newlines=True, - input=quote_for_hunspell("\n".join(texts_for_hunspell.values())), - stdout=subprocess.PIPE, - check=True, + input_lines = flatten( + pool.map( + functools.partial(po_to_text, drop_capitalized=drop_capitalized), + po_files, + ) ) - except subprocess.CalledProcessError: - return -1 - return parse_hunspell_output(texts_for_hunspell, output) + if debug_only: + for filename, line, text in input_lines: + print(filename, line, text, sep=":") + return 0 + if not input_lines: + return 0 + + # Distribute input lines across workers + lines_per_job = (len(input_lines) + jobs - 1) // jobs + chunked_inputs = [ + input_lines[i : i + lines_per_job] + for i in range(0, len(input_lines), lines_per_job) + ] + errors = flatten( + pool.map( + functools.partial(run_hunspell, language, personal_dict), + chunked_inputs, + ) + ) + finally: + pool.close() + pool.join() + + for error in errors: + print(*error, sep=":") + return len(errors) -def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int: - """Parse `hunspell -a` output. - - Print one line per error on stderr, of the following format: - - FILE:LINE:ERROR - - Returns the number of errors. - - hunspell_input contains a dict of files: all_lines_for_this_file. 
- """ - errors = 0 - checked_files = iter(hunspell_input.items()) - checked_file_name, checked_text = next(checked_files) - checked_lines = iter(checked_text.split("\n")) - next(checked_lines) - current_line_number = 1 - for line in hunspell_output.stdout.split("\n")[1:]: - if not line: +def parse_hunspell_output(inputs, outputs) -> List[Tuple[str, int, str]]: + """Parse `hunspell -a` output and collect all errors.""" + # skip first line of hunspell output (it's the banner) + outputs = iter(outputs[1:]) + errors = [] + for po_input_line, output_line in zip(inputs, outputs): + if not po_input_line.text: + continue + while output_line: + if output_line.startswith("&"): + _, original, *_ = output_line.split() + if look_like_a_word(original): + errors.append( + (po_input_line.filename, po_input_line.line, original) + ) try: - next(checked_lines) - current_line_number += 1 + output_line = next(outputs) except StopIteration: - try: - checked_file_name, checked_text = next(checked_files) - checked_lines = iter(checked_text.split("\n")) - next(checked_lines) - current_line_number = 1 - except StopIteration: - return errors - continue - if line == "*": # OK - continue - if line[0] == "&": - _, original, *_ = line.split() - if look_like_a_word(original): - print(checked_file_name, current_line_number, original, sep=":") - errors += 1 - raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.") + break + return errors def gracefull_handling_of_missing_dicts(language): @@ -457,6 +495,7 @@ def main(): args.language, drop_capitalized, args.debug, + args.jobs, ) except POSpellException as err: print(err, file=sys.stderr) diff --git a/tox.ini b/tox.ini index a34f368..5673141 100644 --- a/tox.ini +++ b/tox.ini @@ -6,6 +6,7 @@ max-line-length = 88 [coverage:run] ; branch = true: would need a lot of pragma: no branch on infinite loops. parallel = true +concurrency = multiprocessing omit = .tox/*