From 5db3a839cbaad9721f614015626e8e830e2f3367 Mon Sep 17 00:00:00 2001 From: Julien Palard Date: Sun, 11 Oct 2020 23:00:30 +0200 Subject: [PATCH] Faster implementation (~twice faster on python-docs-fr). --- pospell.py | 82 ++++++++++++++++------- tests/expected_to_fail/1.po | 2 + tests/expected_to_fail/issue19.po | 17 +++++ tests/expected_to_success/1.po | 2 + tests/expected_to_success/accronyms.po | 14 ++++ tests/expected_to_success/hour.po | 2 + tests/expected_to_success/versions.po | 2 + tests/test_pospell.py | 93 ++++---------------------- 8 files changed, 107 insertions(+), 107 deletions(-) create mode 100644 tests/expected_to_fail/1.po create mode 100644 tests/expected_to_fail/issue19.po create mode 100644 tests/expected_to_success/1.po create mode 100644 tests/expected_to_success/accronyms.po create mode 100644 tests/expected_to_success/hour.po create mode 100644 tests/expected_to_success/versions.po diff --git a/pospell.py b/pospell.py index a211c61..eba15f6 100644 --- a/pospell.py +++ b/pospell.py @@ -2,6 +2,8 @@ """ from collections import defaultdict import io +from string import digits +from unicodedata import category import logging import subprocess import sys @@ -108,9 +110,27 @@ def strip_rst(line): line = line[:-2] parser = docutils.parsers.rst.Parser() components = (docutils.parsers.rst.Parser,) - settings = docutils.frontend.OptionParser( - components=components - ).get_default_values() + settings = docutils.frontend.Values( + { + "report_level": 2, + "halt_level": 4, + "exit_status_level": 5, + "debug": None, + "warning_stream": None, + "error_encoding": "utf-8", + "error_encoding_error_handler": "backslashreplace", + "language_code": "en", + "id_prefix": "", + "auto_id_prefix": "id", + "pep_references": None, + "pep_base_url": "http://www.python.org/dev/peps/", + "pep_file_url_template": "pep-%04d", + "rfc_references": None, + "rfc_base_url": "http://tools.ietf.org/html/", + "tab_width": 8, + "trim_footnote_reference_space": None, + } + ) stderr_stringio = io.StringIO() with redirect_stderr(stderr_stringio): document = new_document("", settings=settings) @@ -130,15 +150,7 @@ def clear(line, drop_capitalized=False, po_path=""): to_drop = { r'', - # Strip accronyms - r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b", - r"---?", # -- and --- separators to be ignored - r" - ", # Drop lone dashes (sometimes used in place of -- or ---) - r"-\\ ", # Ignore "MINUS BACKSLASH SPACE" typically used in - # formulas, like '-\ *π*' but *π* gets removed too r"{[a-z_]*?}", # Sphinx variable - r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?", # Versions - r"[0-9]+h", # Hours r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable r"« . »", # Single letter examples (typically in Unicode documentation) } @@ -245,10 +257,25 @@ def parse_args(): return args +def look_like_a_word(word): + """Used to filter out non-words like `---` or `-0700` so they don't + get reported. They typically are not errors. + """ + if not word: + return False + if any(digit in word for digit in digits): + return False + if len([c for c in word if category(c) == "Lu"]) > 1: + return False # Probably an accronym, or a name like CPython, macOS, SQLite, ... + if "-" in word: + return False + return True + + def spell_check( po_files, personal_dict=None, - language="en_EN", + language="en_US", drop_capitalized=False, debug_only=False, ): @@ -260,24 +287,27 @@ def spell_check( """ errors = [] personal_dict_arg = ["-p", personal_dict] if personal_dict else [] + texts_for_hunspell = {} for po_file in po_files: if debug_only: print(po_to_text(str(po_file), drop_capitalized)) continue - text_for_hunspell = po_to_text(str(po_file), drop_capitalized) - try: - output = subprocess.run( - ["hunspell", "-d", language, "-l"] + personal_dict_arg, - universal_newlines=True, - input=text_for_hunspell, - stdout=subprocess.PIPE, - ) - except subprocess.CalledProcessError: - return -1 - if not output.stdout: - continue # No errors :) - line_of_words = defaultdict(set) - for misspelled_word in {word for word in output.stdout.split("\n") if word}: + texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized) + try: + output = subprocess.run( + ["hunspell", "-d", language, "-l"] + personal_dict_arg, + universal_newlines=True, + input="\n".join(texts_for_hunspell.values()), + stdout=subprocess.PIPE, + ) + except subprocess.CalledProcessError: + return -1 + if not output.stdout: + return 0 + for misspelled_word in { + word for word in output.stdout.split("\n") if look_like_a_word(word) + }: + for po_file, text_for_hunspell in texts_for_hunspell.items(): for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1): if misspelled_word in line: errors.append((po_file, line_number, misspelled_word)) diff --git a/tests/expected_to_fail/1.po b/tests/expected_to_fail/1.po new file mode 100644 index 0000000..177ddea --- /dev/null +++ b/tests/expected_to_fail/1.po @@ -0,0 +1,2 @@ +msgid "Python doc is translated!" +msgstr "Python doc is tranlsated!" diff --git a/tests/expected_to_fail/issue19.po b/tests/expected_to_fail/issue19.po new file mode 100644 index 0000000..f5be5b1 --- /dev/null +++ b/tests/expected_to_fail/issue19.po @@ -0,0 +1,17 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR Free Software Foundation, Inc. +# FIRST AUTHOR , YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"PO-Revision-Date: 2020-10-11 22:08+0200\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=CHARSET\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "pubb/subb yo" +msgstr "pubb/subb" diff --git a/tests/expected_to_success/1.po b/tests/expected_to_success/1.po new file mode 100644 index 0000000..2898db6 --- /dev/null +++ b/tests/expected_to_success/1.po @@ -0,0 +1,2 @@ +msgid "Python doc is translated!" +msgstr "Python doc is translated!" diff --git a/tests/expected_to_success/accronyms.po b/tests/expected_to_success/accronyms.po new file mode 100644 index 0000000..107ac75 --- /dev/null +++ b/tests/expected_to_success/accronyms.po @@ -0,0 +1,14 @@ +msgid "HTTP is great." +msgstr "HTTP is great." + +msgid "POSIX.1 is great too." +msgstr "POSIX.1 is great too." + +msgid "non-HTTP" +msgstr "non-HTTP" + +msgid "HTTP" +msgstr "HTTP" + +msgid "PEPs" +msgstr "PEPS" diff --git a/tests/expected_to_success/hour.po b/tests/expected_to_success/hour.po new file mode 100644 index 0000000..05d68bb --- /dev/null +++ b/tests/expected_to_success/hour.po @@ -0,0 +1,2 @@ +msgid "Rendez-vous à 10h chez Murex" +msgstr "See your at 10h at Murex" diff --git a/tests/expected_to_success/versions.po b/tests/expected_to_success/versions.po new file mode 100644 index 0000000..40af8a4 --- /dev/null +++ b/tests/expected_to_success/versions.po @@ -0,0 +1,2 @@ +msgid "under python 1.6a1, 1.5.2, and earlier." +msgstr "under python 1.6a1, 1.5.2, and earlier." diff --git a/tests/test_pospell.py b/tests/test_pospell.py index 592c3d2..8c119a5 100644 --- a/tests/test_pospell.py +++ b/tests/test_pospell.py @@ -1,6 +1,9 @@ +import os from types import SimpleNamespace from pathlib import Path +import pytest + from pospell import clear, strip_rst, spell_check @@ -28,9 +31,6 @@ def test_clear(): # We remove soft hyphens assert clear("some\xadthing") == "something" - # We drop hours because hunspell whines on them - assert "10h" not in clear("Rendez-vous à 10h chez Murex") - # When we removed a dashed name, remove it all assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait" assert "Marc-André" in clear("Marc-André Lemburg a fait", True) @@ -46,10 +46,6 @@ def test_clear(): # We remove variables assert "days_since" not in clear("Starting {days_since} days ago") - # Drop PEP 440 versions - assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.") - assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.") - # Double space should change nothing assert clear("Test. Aujourd'hui, j'ai faim.") == clear( "Test. Aujourd'hui, j'ai faim." @@ -58,81 +54,16 @@ def test_clear(): assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers")) -def test_clear_accronyms(): - for drop_capitalized in True, False: - # We always drop accronyms - assert "HTTP" not in clear("HTTP is great.", drop_capitalized) - - # Even suffixed with a number - assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized) - - # Correctly drop prefix of accronyms - assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized) - - # Also skip accronyms in the middle of a sentence - assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized) - - assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized) +FIXTURE_DIR = Path(__file__).resolve().parent -def test_with_an_error(tmp_path, capsys, monkeypatch): - import subprocess - - tmp_path = Path(tmp_path) - monkeypatch.setattr( - subprocess, - "run", - lambda *args, **kwargs: SimpleNamespace(stdout="Pyhton\n"), - ) - (tmp_path / "test.po").write_text( - """ -msgid "Python FTW!" -msgstr "Gloire à Pyhton !" -""" - ) - assert spell_check([tmp_path / "test.po"]) > 0 - captured = capsys.readouterr() - assert "Pyhton" in captured.out - assert not captured.err +@pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_fail").glob("*.po")) +def test_expected_to_fail(po_file, capsys): + assert spell_check([po_file]) > 0 + assert not capsys.readouterr().err -def test_with_no_error(tmp_path, capsys, monkeypatch): - import subprocess - - tmp_path = Path(tmp_path) - monkeypatch.setattr( - subprocess, - "run", - lambda *args, **kwargs: SimpleNamespace(stdout=""), - ) - (tmp_path / "test.po").write_text( - """ -msgid "Python FTW!" -msgstr "Gloire à Python !" -""" - ) - assert spell_check([tmp_path / "test.po"]) == 0 - captured = capsys.readouterr() - assert not captured.out - assert not captured.err - - -def test_issue_19(tmp_path, capsys, monkeypatch): - import subprocess - - tmp_path = Path(tmp_path) - monkeypatch.setattr( - subprocess, - "run", - lambda *args, **kwargs: SimpleNamespace(stdout="pubb\nsubb\n"), - ) - (tmp_path / "test.po").write_text( - """ -msgid "pubb/subb yo" -msgstr "pubb/subb" -""" - ) - assert spell_check([tmp_path / "test.po"]) > 0 - captured = capsys.readouterr() - assert "pubb" in captured.out - assert not captured.err +@pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_success").glob("*.po")) +def test_expected_to_success(po_file, capsys): + assert spell_check([po_file]) == 0 + assert not capsys.readouterr().err