Use hunspell -l instead of hunspell -u3. Fixes #12 (#16)

2020-07-01 17:35:13 +02:00 · 2020-07-01 17:35:13 +02:00 · f90ac406af
parent 9c93c50106
commit f90ac406af
2 changed files with 63 additions and 86 deletions
--- a/pospell.py
+++ b/pospell.py
@ -1,5 +1,6 @@
 """pospell is a spellcheckers for po files containing reStructuedText.
 """
+from collections import defaultdict
 import io
 import logging
 import subprocess
@ -23,6 +24,7 @@ __version__ = "1.0.4"

 DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}

+
 try:
    HUNSPELL_VERSION = subprocess.check_output(
        ["hunspell", "--version"], universal_newlines=True
@ -123,11 +125,12 @@ def strip_rst(line):
    return str(visitor)


-def clear(po_path, line, drop_capitalized=False):
+def clear(line, drop_capitalized=False, po_path=""):
    """Clear various other syntaxes we may encounter in a line.
    """
    # Normalize spaces
-    line = regex.sub(r"\s+", " ", line)
+    line = regex.sub(r"\s+", " ", line).replace("\xad", "")
+
    to_drop = {
        r'<a href="[^"]*?">',
        # Strip accronyms
@ -141,7 +144,6 @@ def clear(po_path, line, drop_capitalized=False):
        r"[0-9]+h",  # Hours
        r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]",  # Sphinx variable
        r"« . »",  # Single letter examples (typically in Unicode documentation)
-        "\xad",  # soft hyphen
    }
    if drop_capitalized:
        to_drop.add(
@ -175,7 +177,7 @@ def po_to_text(po_path, drop_capitalized=False):
        while lines < entry.linenum:
            buffer.append("")
            lines += 1
-        buffer.append(clear(po_path, strip_rst(entry.msgstr), drop_capitalized))
+        buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
        lines += 1
    return "\n".join(buffer)

@ -256,35 +258,35 @@ def spell_check(

    Debug only will show what's passed to Hunspell instead of passing it.
    """
-    errors = 0
+    errors = []
    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        tmpdir = Path(tmpdirname)
-        for po_file in po_files:
-            if debug_only:
-                print(po_to_text(str(po_file), drop_capitalized))
-                continue
-            (tmpdir / po_file.name).write_text(
-                po_to_text(str(po_file), drop_capitalized)
+    for po_file in po_files:
+        if debug_only:
+            print(po_to_text(str(po_file), drop_capitalized))
+            continue
+        text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
+        try:
+            output = subprocess.run(
+                ["hunspell", "-d", language, "-l"] + personal_dict_arg,
+                universal_newlines=True,
+                input=text_for_hunspell,
+                stdout=subprocess.PIPE,
            )
-            try:
-                output = subprocess.check_output(
-                    ["hunspell", "-d", language]
-                    + personal_dict_arg
-                    + ["-u3", str(tmpdir / po_file.name)],
-                    universal_newlines=True,
-                )
-            except subprocess.CalledProcessError:
-                return -1
-            for line in output.split("\n"):
-                match = regex.match(
-                    r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$",
-                    line,
-                )
-                if match:
-                    errors += 1
-                    print(po_file, match.group("line"), match.group("error"), sep=":")
-    return errors
+        except subprocess.CalledProcessError:
+            return -1
+        if not output.stdout:
+            continue  # No errors :)
+        line_of_words = defaultdict(set)
+        for line, text in enumerate(text_for_hunspell.split("\n"), start=1):
+            for word in text.split():
+                line_of_words[word].add(line)
+        for misspelled_word in set(output.stdout.split("\n")):
+            for line_number in line_of_words[misspelled_word]:
+                errors.append((po_file, line_number, misspelled_word))
+    errors.sort()
+    for error in errors:
+        print(":".join(str(token) for token in error))
+    return len(errors)


 def gracefull_handling_of_missing_dicts(language):
--- a/tests/test_pospell.py
+++ b/tests/test_pospell.py
@ -3,95 +3,70 @@ from pospell import clear, strip_rst

 def test_clear():
    # We don't remove legitimally capitalized first words:
-    assert clear("test", "Sport is great.") == "Sport is great."
-    assert clear("test", "Sport is great.", drop_capitalized=True) == "Sport is great."
+    assert clear("Sport is great.") == "Sport is great."
+    assert clear("Sport is great.", True) == "Sport is great."

    # Sometimes we can't guess it's a firstname:
-    assert clear("test", "Julien Palard teste.") == "Julien Palard teste."
-    assert (
-        clear("test", "Julien Palard teste.", drop_capitalized=True) == "Julien  teste."
-    )
+    assert clear("Julien Palard teste.") == "Julien Palard teste."
+    assert "Palard" not in clear("Julien Palard teste.", True)

    # We remove capitalized words in the middle of a sentence
    # they are typically names
-    assert clear("test", "Great is Unicode.") == "Great is Unicode."
-    assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."
+    assert clear("Great is Unicode.") == "Great is Unicode."
+    assert "Unicode" not in clear("Great is Unicode.", True)

    # We remove capitalized words even prefixed with l' in french.
-    assert (
-        clear("test", "Bah si, l'Unicode c'est bien.")
-        == "Bah si, l'Unicode c'est bien."
-    )
-    assert (
-        clear("test", "Bah si, l'Unicode c'est bien.", drop_capitalized=True)
-        == "Bah si,  c'est bien."
-    )
+    assert clear("Bah si, l'Unicode c'est bien.") == "Bah si, l'Unicode c'est bien."
+    assert "Unicode" not in clear("Bah si, l'Unicode c'est bien.", True)

    # We remove single letters in quotes
-    assert clear("test", "La lettre « é » est seule.") == "La lettre  est seule."
+    assert "é" not in clear("La lettre « é » est seule.")

    # We remove soft hyphens
-    assert clear("test", "some\xadthing") == "something"
+    assert clear("some\xadthing") == "something"

    # We drop hours because hunspell whines on them
-    assert clear("test", "Rendez-vous à 10h chez Murex") == "Rendez-vous à  chez Murex"
+    assert "10h" not in clear("Rendez-vous à 10h chez Murex")

    # When we removed a dashed name, remove it all
-    assert clear("test", "Marc-André Lemburg a fait") != "Marc- Lemburg a fait"
+    assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
+    assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
+    assert "Lemburg" not in clear("Marc-André Lemburg a fait", True)

    # Even in the middle of a sentence
-    assert (
-        clear("test", "Hier, Marc-André Lemburg a fait")
-        == "Hier, Marc-André Lemburg a fait"
-    )
+    assert clear("Hier, Marc-André Lemburg a fait") == "Hier, Marc-André Lemburg a fait"
+    assert "Marc-André" not in clear("Hier, Marc-André Lemburg a fait", True)
+    assert "André" not in clear("Hier, Marc-André Lemburg a fait", True)
+    assert "Marc" not in clear("Hier, Marc-André Lemburg a fait", True)
+    assert "Lemburg" not in clear("Hier, Marc-André Lemburg a fait", True)

    # We remove variables
-    assert clear("test", "Starting {days_since} days ago") == "Starting  days ago"
+    assert "days_since" not in clear("Starting {days_since} days ago")

    # Drop PEP 440 versions
-    assert (
-        clear("test", "under python 1.6a1, 1.5.2, and earlier.")
-        == "under python , , and earlier."
-    )
+    assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
+    assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")

    # Double space should change nothing
-    assert clear("test", "Test. Aujourd'hui, j'ai faim.") == clear(
-        "test", "Test.  Aujourd'hui, j'ai faim."
+    assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
+        "Test.  Aujourd'hui, j'ai faim."
    )

-    assert (
-        clear("test", strip_rst(":pep:`305` - Interface des fichiers"))
-        == "Interface des fichiers"
-    )
+    assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))


 def test_clear_accronyms():
    for drop_capitalized in True, False:
        # We always drop accronyms
-        assert (
-            clear("test", "HTTP is great.", drop_capitalized=drop_capitalized)
-            == " is great."
-        )
+        assert "HTTP" not in clear("HTTP is great.", drop_capitalized)

        # Even suffixed with a number
-        assert (
-            clear("test", "POSIX.1 is great.", drop_capitalized=drop_capitalized)
-            == " is great."
-        )
+        assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)

        # Correctly drop prefix of accronyms
-        assert (
-            clear("test", "non-HTTP is bad.", drop_capitalized=drop_capitalized)
-            == " is bad."
-        )
+        assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)

        # Also skip accronyms in the middle of a sentence
-        assert (
-            clear("test", "Yes HTTP is great.", drop_capitalized=drop_capitalized)
-            == "Yes  is great."
-        )
+        assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)

-        assert (
-            clear("", "Ho. PEPs good.", drop_capitalized=drop_capitalized)
-            == "Ho.  good."
-        )
+        assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)