FIX: Regression in dropping plural acronyms like PEPs.

This commit is contained in:
Julien Palard 2019-10-16 17:27:03 +02:00
parent 6d944a47fb
commit 4a3abc4ad5
2 changed files with 4 additions and 1 deletions

View File

@@ -130,7 +130,7 @@ def clear(po_path, line, drop_capitalized=False):
to_drop = {
r'<a href="[^"]*?">',
# Strip accronyms
r"\b\p{Uppercase}{2,}\b",
r"\b\p{Uppercase}{2,}[\w-]*\b",
r"---?", # -- and --- separators to be ignored
r" - ", # Drop lone dashes (sometimes used in place of -- or ---)
r"-\\ ", # Ignore "MINUS BACKSLASH SPACE" typically used in

View File

@@ -27,6 +27,9 @@ def test_clear_keep_capital():
assert clear("test", "Great is Unicode.") == "Great is Unicode."
assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."
assert clear("", "Ho. PEPs good.") == "Ho. good."
assert clear("", "Ho. PEPs good.", drop_capitalized=True) == "Ho. good."
# We remove capitalized words even prefixed with l' in french.
assert (
clear("test", "Bah si, l'Unicode c'est bien.")