Refactor pospell to use multiprocessing (#32)

One of the main drawbacks of pospell at the moment is that checking is
performed serially by a single hunspell process. In small projects this
is not noticeable, but in slightly bigger ones checking times add up
(e.g., in python-docs-es it takes ~2 minutes to check the whole set of
.po files).

The obvious solution to speed things up is to use multiprocessing,
parallelising the work at two different places: first, when reading the
input .po files and collecting the input strings to feed into hunspell,
and second, when running hunspell itself.

This commit implements this support. It works as follows:

 * A new namedtuple called input_line has been added. It contains a
   filename, a line number, and text, and thus uniquely identifies an
   input line in a self-contained way.
 * When collecting input to feed into hunspell, the po_to_text routine
   collects input_lines instead of a simple string. This is done with a
   multiprocessing Pool to run in parallel across all input files.
 * The input_lines are split into N blocks, with N being the size of
   the pool. Note that during this process input_lines from different
   files might end up in the same block, and input_lines from the same
   file might end up in different blocks; however, since input_lines
   are self-contained, no information is lost (see the sketch right
   after this list).
 * N hunspell instances are run over the N blocks of input_lines using
   the pool (only the text field from the input_lines is fed into
   hunspell).
 * When interpreting errors from hunspell we can match an input_line
   with its corresponding hunspell output lines, and thus can identify
   the original file:line that caused the error.
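
To make the flow concrete, here is a minimal, runnable sketch of the
two pool stages described above. fake_po_to_text and fake_check are
hypothetical stand-ins for the real po_to_text and run_hunspell (both
appear in the diff below); the chunking arithmetic is the same:

import collections
import multiprocessing

# A self-contained (filename, line, text) record, as in the diff below.
input_line = collections.namedtuple("input_line", "filename line text")


def fake_po_to_text(filename):
    # Hypothetical stand-in for po_to_text: pretend each file has 3 lines.
    return [input_line(filename, n, "text %d" % n) for n in range(3)]


def fake_check(block):
    # Hypothetical stand-in for run_hunspell: report what this worker saw.
    return [(line.filename, line.line) for line in block]


def flatten(list_of_lists):
    return [element for a_list in list_of_lists for element in a_list]


if __name__ == "__main__":
    jobs = 2
    pool = multiprocessing.Pool(jobs)
    try:
        # Stage 1: parse all .po files in parallel, merge into one flat list.
        lines = flatten(pool.map(fake_po_to_text, ["a.po", "b.po", "c.po"]))
        # Stage 2: split into N blocks of ceil(len/jobs) lines; lines from
        # different files may share a block, which is safe because every
        # input_line carries its own origin.
        per_job = (len(lines) + jobs - 1) // jobs
        blocks = [lines[i : i + per_job] for i in range(0, len(lines), per_job)]
        print(flatten(pool.map(fake_check, blocks)))
    finally:
        pool.close()
        pool.join()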

The multiprocessing pool is sized via a new -j/--jobs command line
option, which defaults to os.cpu_count() so that, out of the box, runs
use all available cores.

These are the kinds of differences I see with python-docs-es on my
machine, so YMMV depending on your setup/project:

$> time pospell -p dict2.txt -l es_ES */*.po -j 1
real    2m1.859s
user    2m6.680s
sys     0m3.829s

$> time pospell -p dict2.txt -l es_ES */*.po -j 2
real    1m10.322s
user    2m18.210s
sys     0m3.559s

Finally, these changes had some minor effects on the tooling around
testing. Pylint complained about spell_check now taking too many
arguments, so pylint's max-args setting has been adjusted as discussed.
Separately, coverage information now needs to be collected for
sub-processes of the main test process; the pytest-cov plug-in does
this automatically, so I've switched tox to use it rather than manually
running pytest under coverage (which would otherwise require extra
setup to account for the sub-processes).
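
For reference, the tox side of that switch typically boils down to
something like the following (a hypothetical sketch, assuming a plain
[testenv] running the suite; the tox commands change itself is not
shown in the diff excerpt below):

# Hypothetical sketch; previously something like
#     commands = coverage run -m pytest
# plus extra configuration to collect sub-process coverage.
[testenv]
deps =
    pytest
    pytest-cov
commands =
    pytest --cov=pospell {posargs}
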
rtobar 2021-11-26 17:26:35 +08:00 committed by GitHub
parent 8b0d6d8778
commit 3553ecd726
3 changed files with 103 additions and 61 deletions

.pylintrc (new file)

@@ -0,0 +1,2 @@
+[DESIGN]
+max-args = 6

pospell.py

@@ -2,10 +2,14 @@
 import io
 from string import digits
 from unicodedata import category
+import collections
+import functools
 import logging
+import multiprocessing
+import os
 import subprocess
 import sys
-from typing import Dict
+from typing import List, Tuple
 from contextlib import redirect_stderr
 from itertools import chain
 from pathlib import Path
@@ -25,6 +29,9 @@ __version__ = "1.0.13"

 DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}

+input_line = collections.namedtuple("input_line", "filename line text")
+
+
 class POSpellException(Exception):
     """All exceptions from this module inherit from this one."""
@@ -202,7 +209,7 @@ def quote_for_hunspell(text):
     against future changes in hunspell.
     """
     out = []
-    for line in text.split("\n"):
+    for line in text:
         out.append("^" + line if line else "")
     return "\n".join(out)
@@ -213,7 +220,7 @@ def po_to_text(po_path, drop_capitalized=False):
     This strips the msgids and all po syntax while keeping lines at
     their same position / line number.
     """
-    buffer = []
+    input_lines = []
     lines = 0
     try:
         entries = polib.pofile(Path(po_path).read_text(encoding="UTF-8"))
@@ -223,11 +230,17 @@
         if entry.msgid == entry.msgstr:
             continue
         while lines < entry.linenum:
-            buffer.append("")
+            input_lines.append(input_line(po_path, lines, ""))
             lines += 1
-        buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
+        input_lines.append(
+            input_line(
+                po_path,
+                lines,
+                clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path),
+            )
+        )
         lines += 1
-    return "\n".join(buffer)
+    return input_lines


 def parse_args():
@@ -287,6 +300,13 @@ def parse_args():
     parser.add_argument(
         "--modified", "-m", action="store_true", help="Use git to find modified files."
     )
+    parser.add_argument(
+        "-j",
+        "--jobs",
+        type=int,
+        default=os.cpu_count(),
+        help="Number of files to check in parallel, defaults to all available CPUs",
+    )
     args = parser.parse_args()
     if args.drop_capitalized and args.no_drop_capitalized:
         print("Error: don't provide both --drop-capitalized AND --no-drop-capitalized.")
@@ -315,12 +335,32 @@ def look_like_a_word(word):
     return True


+def run_hunspell(language, personal_dict, input_lines):
+    """Run hunspell over the given input lines."""
+    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
+    try:
+        output = subprocess.check_output(
+            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
+            universal_newlines=True,
+            input=quote_for_hunspell(text for _, _, text in input_lines),
+        )
+    except subprocess.CalledProcessError:
+        return -1
+    return parse_hunspell_output(input_lines, output.splitlines())
+
+
+def flatten(list_of_lists):
+    """[[a,b,c], [d,e,f]] -> [a,b,c,d,e,f]."""
+    return [element for a_list in list_of_lists for element in a_list]
+
+
 def spell_check(
     po_files,
     personal_dict=None,
     language="en_US",
     drop_capitalized=False,
     debug_only=False,
+    jobs=os.cpu_count(),
 ):
     """Check for spelling mistakes in the given po_files.
@@ -329,67 +369,65 @@ def spell_check(

     Debug only will show what's passed to Hunspell instead of passing it.
     """
-    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
-    texts_for_hunspell = {}
-    for po_file in po_files:
-        if debug_only:
-            print(po_to_text(str(po_file), drop_capitalized))
-            continue
-        texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized)
-    if debug_only:
-        return 0
+    # Pool.__exit__ calls terminate() instead of close(), we need the latter,
+    # which ensures the processes' atexit handlers execute fully, which in
+    # turn lets coverage write the sub-processes' coverage information
+    pool = multiprocessing.Pool(jobs)  # pylint: disable=consider-using-with
     try:
-        output = subprocess.run(
-            ["hunspell", "-d", language, "-a"] + personal_dict_arg,
-            universal_newlines=True,
-            input=quote_for_hunspell("\n".join(texts_for_hunspell.values())),
-            stdout=subprocess.PIPE,
-            check=True,
-        )
-    except subprocess.CalledProcessError:
-        return -1
-    return parse_hunspell_output(texts_for_hunspell, output)
+        input_lines = flatten(
+            pool.map(
+                functools.partial(po_to_text, drop_capitalized=drop_capitalized),
+                po_files,
+            )
+        )
+        if debug_only:
+            for filename, line, text in input_lines:
+                print(filename, line, text, sep=":")
+            return 0
+        if not input_lines:
+            return 0
+        # Distribute input lines across workers
+        lines_per_job = (len(input_lines) + jobs - 1) // jobs
+        chunked_inputs = [
+            input_lines[i : i + lines_per_job]
+            for i in range(0, len(input_lines), lines_per_job)
+        ]
+        errors = flatten(
+            pool.map(
+                functools.partial(run_hunspell, language, personal_dict),
+                chunked_inputs,
+            )
+        )
+    finally:
+        pool.close()
+        pool.join()
+    for error in errors:
+        print(*error, sep=":")
+    return len(errors)


-def parse_hunspell_output(hunspell_input: Dict[str, str], hunspell_output) -> int:
-    """Parse `hunspell -a` output.
-
-    Print one line per error on stderr, of the following format:
-
-        FILE:LINE:ERROR
-
-    Returns the number of errors.
-
-    hunspell_input contains a dict of files: all_lines_for_this_file.
-    """
-    errors = 0
-    checked_files = iter(hunspell_input.items())
-    checked_file_name, checked_text = next(checked_files)
-    checked_lines = iter(checked_text.split("\n"))
-    next(checked_lines)
-    current_line_number = 1
-    for line in hunspell_output.stdout.split("\n")[1:]:
-        if not line:
-            try:
-                next(checked_lines)
-                current_line_number += 1
-            except StopIteration:
-                try:
-                    checked_file_name, checked_text = next(checked_files)
-                    checked_lines = iter(checked_text.split("\n"))
-                    next(checked_lines)
-                    current_line_number = 1
-                except StopIteration:
-                    return errors
-            continue
-        if line == "*":  # OK
-            continue
-        if line[0] == "&":
-            _, original, *_ = line.split()
-            if look_like_a_word(original):
-                print(checked_file_name, current_line_number, original, sep=":")
-                errors += 1
-    raise Unreachable("Got this one? I'm sorry, read XKCD 2200, then open an issue.")
+def parse_hunspell_output(inputs, outputs) -> List[Tuple[str, int, str]]:
+    """Parse `hunspell -a` output and collect all errors."""
+    # skip first line of hunspell output (it's the banner)
+    outputs = iter(outputs[1:])
+    errors = []
+    for po_input_line, output_line in zip(inputs, outputs):
+        if not po_input_line.text:
+            continue
+        while output_line:
+            if output_line.startswith("&"):
+                _, original, *_ = output_line.split()
+                if look_like_a_word(original):
+                    errors.append(
+                        (po_input_line.filename, po_input_line.line, original)
+                    )
+            try:
+                output_line = next(outputs)
+            except StopIteration:
+                break
+    return errors


 def gracefull_handling_of_missing_dicts(language):
@@ -457,6 +495,7 @@ def main():
             args.language,
             drop_capitalized,
             args.debug,
+            args.jobs,
         )
     except POSpellException as err:
         print(err, file=sys.stderr)


@@ -6,6 +6,7 @@ max-line-length = 88

 [coverage:run]
 ; branch = true: would need a lot of pragma: no branch on infinite loops.
 parallel = true
+concurrency = multiprocessing
 omit =
     .tox/*