From 5db3a839cbaad9721f614015626e8e830e2f3367 Mon Sep 17 00:00:00 2001
From: Julien Palard <julien@palard.fr>
Date: Sun, 11 Oct 2020 23:00:30 +0200
Subject: [PATCH] Faster implementation (about twice as fast on python-docs-fr).

---
 pospell.py                             | 82 ++++++++++++++++-------
 tests/expected_to_fail/1.po            |  2 +
 tests/expected_to_fail/issue19.po      | 17 +++++
 tests/expected_to_success/1.po         |  2 +
 tests/expected_to_success/accronyms.po | 14 ++++
 tests/expected_to_success/hour.po      |  2 +
 tests/expected_to_success/versions.po  |  2 +
 tests/test_pospell.py                  | 93 ++++----------------------
 8 files changed, 107 insertions(+), 107 deletions(-)
 create mode 100644 tests/expected_to_fail/1.po
 create mode 100644 tests/expected_to_fail/issue19.po
 create mode 100644 tests/expected_to_success/1.po
 create mode 100644 tests/expected_to_success/accronyms.po
 create mode 100644 tests/expected_to_success/hour.po
 create mode 100644 tests/expected_to_success/versions.po
diff --git a/pospell.py b/pospell.py
index a211c61..eba15f6 100644
--- a/pospell.py
+++ b/pospell.py
@@ -2,6 +2,8 @@
 """
 from collections import defaultdict
 import io
+from string import digits
+from unicodedata import category
 import logging
 import subprocess
 import sys
@@ -108,9 +110,27 @@ def strip_rst(line):
         line = line[:-2]
     parser = docutils.parsers.rst.Parser()
     components = (docutils.parsers.rst.Parser,)
-    settings = docutils.frontend.OptionParser(
-        components=components
-    ).get_default_values()
+    settings = docutils.frontend.Values(
+        {
+            "report_level": 2,
+            "halt_level": 4,
+            "exit_status_level": 5,
+            "debug": None,
+            "warning_stream": None,
+            "error_encoding": "utf-8",
+            "error_encoding_error_handler": "backslashreplace",
+            "language_code": "en",
+            "id_prefix": "",
+            "auto_id_prefix": "id",
+            "pep_references": None,
+            "pep_base_url": "http://www.python.org/dev/peps/",
+            "pep_file_url_template": "pep-%04d",
+            "rfc_references": None,
+            "rfc_base_url": "http://tools.ietf.org/html/",
+            "tab_width": 8,
+            "trim_footnote_reference_space": None,
+        }
+    )
     stderr_stringio = io.StringIO()
     with redirect_stderr(stderr_stringio):
         document = new_document("<rst-doc>", settings=settings)
@@ -130,15 +150,7 @@ def clear(line, drop_capitalized=False, po_path=""):
 
     to_drop = {
         r'<a href="[^"]*?">',
-        # Strip accronyms
-        r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
-        r"---?",  # -- and --- separators to be ignored
-        r" - ",  # Drop lone dashes (sometimes used in place of -- or ---)
-        r"-\\ ",  # Ignore "MINUS BACKSLASH SPACE" typically used in
-        # formulas, like '-\ *π*' but *π* gets removed too
         r"{[a-z_]*?}",  # Sphinx variable
-        r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?",  # Versions
-        r"[0-9]+h",  # Hours
         r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
         r"« . »",  # Single letter examples (typically in Unicode documentation)
     }
@@ -245,10 +257,25 @@ def parse_args():
     return args
 
 
+def look_like_a_word(word):
+    """Return False for tokens that should not be reported as misspellings (empty,
+    containing a digit or a dash, or 2+ uppercase letters: `---`, `-0700`); else True.
+    """
+    if not word:  # Empty token, e.g. from splitting output on newlines.
+        return False
+    if any(digit in word for digit in digits):  # `digits` is string.digits (0-9).
+        return False
+    if len([c for c in word if category(c) == "Lu"]) > 1:  # "Lu": uppercase letter.
+        return False  # Probably an acronym, or a name like CPython, macOS, SQLite, ...
+    if "-" in word:
+        return False
+    return True
+
+
 def spell_check(
     po_files,
     personal_dict=None,
-    language="en_EN",
+    language="en_US",
     drop_capitalized=False,
     debug_only=False,
 ):
@@ -260,24 +287,27 @@ def spell_check(
     """
     errors = []
     personal_dict_arg = ["-p", personal_dict] if personal_dict else []
+    texts_for_hunspell = {}
     for po_file in po_files:
         if debug_only:
             print(po_to_text(str(po_file), drop_capitalized))
             continue
-        text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
-        try:
-            output = subprocess.run(
-                ["hunspell", "-d", language, "-l"] + personal_dict_arg,
-                universal_newlines=True,
-                input=text_for_hunspell,
-                stdout=subprocess.PIPE,
-            )
-        except subprocess.CalledProcessError:
-            return -1
-        if not output.stdout:
-            continue  # No errors :)
-        line_of_words = defaultdict(set)
-        for misspelled_word in {word for word in output.stdout.split("\n") if word}:
+        texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized)
+    try:
+        output = subprocess.run(
+            ["hunspell", "-d", language, "-l"] + personal_dict_arg,
+            universal_newlines=True,
+            input="\n".join(texts_for_hunspell.values()),
+            stdout=subprocess.PIPE,
+        )
+    except subprocess.CalledProcessError:
+        return -1
+    if not output.stdout:
+        return 0
+    for misspelled_word in {
+        word for word in output.stdout.split("\n") if look_like_a_word(word)
+    }:
+        for po_file, text_for_hunspell in texts_for_hunspell.items():
             for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1):
                 if misspelled_word in line:
                     errors.append((po_file, line_number, misspelled_word))
diff --git a/tests/expected_to_fail/1.po b/tests/expected_to_fail/1.po
new file mode 100644
index 0000000..177ddea
--- /dev/null
+++ b/tests/expected_to_fail/1.po
@@ -0,0 +1,2 @@
+msgid "Python doc is translated!"
+msgstr "Python doc is tranlsated!"
diff --git a/tests/expected_to_fail/issue19.po b/tests/expected_to_fail/issue19.po
new file mode 100644
index 0000000..f5be5b1
--- /dev/null
+++ b/tests/expected_to_fail/issue19.po
@@ -0,0 +1,17 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR Free Software Foundation, Inc.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"PO-Revision-Date: 2020-10-11 22:08+0200\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+msgid "pubb/subb yo"
+msgstr "pubb/subb"
diff --git a/tests/expected_to_success/1.po b/tests/expected_to_success/1.po
new file mode 100644
index 0000000..2898db6
--- /dev/null
+++ b/tests/expected_to_success/1.po
@@ -0,0 +1,2 @@
+msgid "Python doc is translated!"
+msgstr "Python doc is translated!"
diff --git a/tests/expected_to_success/accronyms.po b/tests/expected_to_success/accronyms.po
new file mode 100644
index 0000000..107ac75
--- /dev/null
+++ b/tests/expected_to_success/accronyms.po
@@ -0,0 +1,14 @@
+msgid "HTTP is great."
+msgstr "HTTP is great."
+
+msgid "POSIX.1 is great too."
+msgstr "POSIX.1 is great too."
+
+msgid "non-HTTP"
+msgstr "non-HTTP"
+
+msgid "HTTP"
+msgstr "HTTP"
+
+msgid "PEPs"
+msgstr "PEPS"
diff --git a/tests/expected_to_success/hour.po b/tests/expected_to_success/hour.po
new file mode 100644
index 0000000..05d68bb
--- /dev/null
+++ b/tests/expected_to_success/hour.po
@@ -0,0 +1,2 @@
+msgid "Rendez-vous à 10h chez Murex"
+msgstr "See your at 10h at Murex"
diff --git a/tests/expected_to_success/versions.po b/tests/expected_to_success/versions.po
new file mode 100644
index 0000000..40af8a4
--- /dev/null
+++ b/tests/expected_to_success/versions.po
@@ -0,0 +1,2 @@
+msgid "under python 1.6a1, 1.5.2, and earlier."
+msgstr "under python 1.6a1, 1.5.2, and earlier."
diff --git a/tests/test_pospell.py b/tests/test_pospell.py
index 592c3d2..8c119a5 100644
--- a/tests/test_pospell.py
+++ b/tests/test_pospell.py
@@ -1,6 +1,9 @@
+import os
 from types import SimpleNamespace
 from pathlib import Path
 
+import pytest
+
 from pospell import clear, strip_rst, spell_check
 
 
@@ -28,9 +31,6 @@ def test_clear():
     # We remove soft hyphens
     assert clear("some\xadthing") == "something"
 
-    # We drop hours because hunspell whines on them
-    assert "10h" not in clear("Rendez-vous à 10h chez Murex")
-
     # When we removed a dashed name, remove it all
     assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
     assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
@@ -46,10 +46,6 @@ def test_clear():
     # We remove variables
     assert "days_since" not in clear("Starting {days_since} days ago")
 
-    # Drop PEP 440 versions
-    assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
-    assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
-
     # Double space should change nothing
     assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
         "Test.  Aujourd'hui, j'ai faim."
@@ -58,81 +54,16 @@ def test_clear():
     assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
 
 
-def test_clear_accronyms():
-    for drop_capitalized in True, False:
-        # We always drop accronyms
-        assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
-
-        # Even suffixed with a number
-        assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
-
-        # Correctly drop prefix of accronyms
-        assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
-
-        # Also skip accronyms in the middle of a sentence
-        assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
-
-        assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)
+FIXTURE_DIR = Path(__file__).resolve().parent  # Directory of this test module; holds the fixture folders.
 
 
-def test_with_an_error(tmp_path, capsys, monkeypatch):
-    import subprocess
-
-    tmp_path = Path(tmp_path)
-    monkeypatch.setattr(
-        subprocess,
-        "run",
-        lambda *args, **kwargs: SimpleNamespace(stdout="Pyhton\n"),
-    )
-    (tmp_path / "test.po").write_text(
-        """
-msgid "Python FTW!"
-msgstr "Gloire à Pyhton !"
-"""
-    )
-    assert spell_check([tmp_path / "test.po"]) > 0
-    captured = capsys.readouterr()
-    assert "Pyhton" in captured.out
-    assert not captured.err
+@pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_fail").glob("*.po"))
+def test_expected_to_fail(po_file, capsys):  # One test case per .po file in expected_to_fail/.
+    assert spell_check([po_file]) > 0  # spell_check must return a positive value for this file.
+    assert not capsys.readouterr().err  # Nothing may have been written to stderr.
 
 
-def test_with_no_error(tmp_path, capsys, monkeypatch):
-    import subprocess
-
-    tmp_path = Path(tmp_path)
-    monkeypatch.setattr(
-        subprocess,
-        "run",
-        lambda *args, **kwargs: SimpleNamespace(stdout=""),
-    )
-    (tmp_path / "test.po").write_text(
-        """
-msgid "Python FTW!"
-msgstr "Gloire à Python !"
-"""
-    )
-    assert spell_check([tmp_path / "test.po"]) == 0
-    captured = capsys.readouterr()
-    assert not captured.out
-    assert not captured.err
-
-
-def test_issue_19(tmp_path, capsys, monkeypatch):
-    import subprocess
-
-    tmp_path = Path(tmp_path)
-    monkeypatch.setattr(
-        subprocess,
-        "run",
-        lambda *args, **kwargs: SimpleNamespace(stdout="pubb\nsubb\n"),
-    )
-    (tmp_path / "test.po").write_text(
-        """
-msgid "pubb/subb yo"
-msgstr "pubb/subb"
-"""
-    )
-    assert spell_check([tmp_path / "test.po"]) > 0
-    captured = capsys.readouterr()
-    assert "pubb" in captured.out
-    assert not captured.err
+@pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_success").glob("*.po"))
+def test_expected_to_success(po_file, capsys):  # One test case per .po file in expected_to_success/.
+    assert spell_check([po_file]) == 0  # spell_check must return exactly 0 for this file.
+    assert not capsys.readouterr().err  # Nothing may have been written to stderr.