FIX: Regression in dropping plural acronyms like PEPs.

2019-10-16 17:27:03 +02:00 · 2019-10-16 17:27:03 +02:00 · 4a3abc4ad5
commit 4a3abc4ad5
parent 6d944a47fb
2 changed files with 4 additions and 1 deletions
--- a/pospell.py
+++ b/pospell.py
@ -130,7 +130,7 @@ def clear(po_path, line, drop_capitalized=False):
    to_drop = {
        r'<a href="[^"]*?">',
        # Strip accronyms
-        r"\b\p{Uppercase}{2,}\b",
+        r"\b\p{Uppercase}{2,}[\w-]*\b",
        r"---?",  # -- and --- separators to be ignored
        r" - ",  # Drop lone dashes (sometimes used in place of -- or ---)
        r"-\\ ",  # Ignore "MINUS BACKSLASH SPACE" typically used in
--- a/tests/test_pospell.py
+++ b/tests/test_pospell.py
@ -27,6 +27,9 @@ def test_clear_keep_capital():
    assert clear("test", "Great is Unicode.") == "Great is Unicode."
    assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."

+    assert clear("", "Ho. PEPs good.") == "Ho.  good."
+    assert clear("", "Ho. PEPs good.", drop_capitalized=True) == "Ho.  good."
+
    # We remove capitalized words even prefixed with l' in french.
    assert (
        clear("test", "Bah si, l'Unicode c'est bien.")