# pospell/tests/test_pospell.py
#
# 73 lines
# 2.9 KiB
# Python

from pospell import clear, strip_rst
def test_clear():
    """Exercise ``clear`` on a range of representative sentences.

    Each assertion checks either that ``clear`` hands back its input
    unchanged, or that a given token is absent from (or still present in)
    the result. Calls that pass ``True`` as the second positional argument
    are the ones where capitalized words are expected to disappear.
    """
    # A legitimately capitalized first word is left alone in both modes.
    sport = "Sport is great."
    assert clear(sport) == sport
    assert clear(sport, True) == sport

    # A first name opening the sentence cannot be told apart from a normal
    # capitalized first word; only the following name is expected to go.
    julien = "Julien Palard teste."
    assert clear(julien) == julien
    assert "Palard" not in clear(julien, True)

    # Capitalized words in the middle of a sentence are typically names:
    # untouched by default, absent when the second argument is True.
    great = "Great is Unicode."
    assert clear(great) == great
    assert "Unicode" not in clear(great, True)

    # Same expectation when the word carries a French "l'" prefix.
    french = "Bah si, l'Unicode c'est bien."
    assert clear(french) == french
    assert "Unicode" not in clear(french, True)

    # A single letter between quotes is removed.
    assert "é" not in clear("La lettre « é » est seule.")

    # Soft hyphens (U+00AD) are removed.
    assert clear("some\xadthing") == "something"

    # Hours are dropped because hunspell complains about them.
    assert "10h" not in clear("Rendez-vous à 10h chez Murex")

    # Dashed name at the start of a sentence: the dashed first word is
    # still present, the following name is not.
    dashed = "Marc-André Lemburg a fait"
    assert clear(dashed).strip() == dashed
    assert "Marc-André" in clear(dashed, True)
    assert "Lemburg" not in clear(dashed, True)

    # Dashed name in the middle of a sentence: no part of it may remain.
    dashed_inside = "Hier, Marc-André Lemburg a fait"
    assert clear(dashed_inside) == dashed_inside
    for name_part in ("Marc-André", "André", "Marc", "Lemburg"):
        assert name_part not in clear(dashed_inside, True)

    # Format-style variables are removed.
    assert "days_since" not in clear("Starting {days_since} days ago")

    # PEP 440 version numbers are dropped.
    versions = "under python 1.6a1, 1.5.2, and earlier."
    for version in ("1.6a1", "1.5.2"):
        assert version not in clear(versions)

    # A double space should change nothing.
    # NOTE(review): as written, both calls receive byte-identical text, so
    # this only compares clear() with itself. Presumably one of the inputs
    # was meant to contain a double space -- confirm against upstream.
    hungry = "Test. Aujourd'hui, j'ai faim."
    assert clear(hungry) == clear(hungry)

    # A :pep: role must not survive strip_rst followed by clear.
    pep_line = ":pep:`305` - Interface des fichiers"
    assert ":pep:`305`" not in clear(strip_rst(pep_line))
def test_clear_accronyms():
    """Check that acronym-like tokens never survive ``clear``.

    Every (token, sentence) pair below is checked with the second argument
    of ``clear`` set to True and then to False; the token must be absent
    from the result in both cases.
    """
    # (token that must be absent, sentence handed to clear)
    cases = (
        # Acronyms are always dropped.
        ("HTTP", "HTTP is great."),
        # Even when suffixed with a number.
        ("POSIX.1", "POSIX.1 is great."),
        # A prefix attached to the acronym goes with it.
        ("non-HTTP", "non-HTTP is bad."),
        # Acronyms in the middle of a sentence are skipped as well.
        ("HTTP", "Yes HTTP is great."),
        ("PEPs", "Ho. PEPs good."),
    )
    for drop_capitalized in (True, False):
        for token, sentence in cases:
            assert token not in clear(sentence, drop_capitalized)