FIX: Regression in dropping plural acronyms like PEPs.
This commit is contained in:
parent
6d944a47fb
commit
4a3abc4ad5
|
@ -130,7 +130,7 @@ def clear(po_path, line, drop_capitalized=False):
|
|||
to_drop = {
|
||||
r'<a href="[^"]*?">',
|
||||
# Strip acronyms
|
||||
r"\b\p{Uppercase}{2,}\b",
|
||||
r"\b\p{Uppercase}{2,}[\w-]*\b",
|
||||
r"---?", # -- and --- separators to be ignored
|
||||
r" - ", # Drop lone dashes (sometimes used in place of -- or ---)
|
||||
r"-\\ ", # Ignore "MINUS BACKSLASH SPACE" typically used in
|
||||
|
|
|
@ -27,6 +27,9 @@ def test_clear_keep_capital():
|
|||
assert clear("test", "Great is Unicode.") == "Great is Unicode."
|
||||
assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."
|
||||
|
||||
assert clear("", "Ho. PEPs good.") == "Ho. good."
|
||||
assert clear("", "Ho. PEPs good.", drop_capitalized=True) == "Ho. good."
|
||||
|
||||
# We remove capitalized words even when prefixed with l' in French.
|
||||
assert (
|
||||
clear("test", "Bah si, l'Unicode c'est bien.")
|
||||
|
|
Loading…
Reference in New Issue
Block a user