parent
9c93c50106
commit
f90ac406af
64
pospell.py
64
pospell.py
|
@ -1,5 +1,6 @@
|
|||
"""pospell is a spellchecker for po files containing reStructuredText.
|
||||
"""
|
||||
from collections import defaultdict
|
||||
import io
|
||||
import logging
|
||||
import subprocess
|
||||
|
@ -23,6 +24,7 @@ __version__ = "1.0.4"
|
|||
|
||||
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
|
||||
|
||||
|
||||
try:
|
||||
HUNSPELL_VERSION = subprocess.check_output(
|
||||
["hunspell", "--version"], universal_newlines=True
|
||||
|
@ -123,11 +125,12 @@ def strip_rst(line):
|
|||
return str(visitor)
|
||||
|
||||
|
||||
def clear(po_path, line, drop_capitalized=False):
|
||||
def clear(line, drop_capitalized=False, po_path=""):
|
||||
"""Clear various other syntaxes we may encounter in a line.
|
||||
"""
|
||||
# Normalize spaces
|
||||
line = regex.sub(r"\s+", " ", line)
|
||||
line = regex.sub(r"\s+", " ", line).replace("\xad", "")
|
||||
|
||||
to_drop = {
|
||||
r'<a href="[^"]*?">',
|
||||
# Strip acronyms
|
||||
|
@ -141,7 +144,6 @@ def clear(po_path, line, drop_capitalized=False):
|
|||
r"[0-9]+h", # Hours
|
||||
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
|
||||
r"« . »", # Single letter examples (typically in Unicode documentation)
|
||||
"\xad", # soft hyphen
|
||||
}
|
||||
if drop_capitalized:
|
||||
to_drop.add(
|
||||
|
@ -175,7 +177,7 @@ def po_to_text(po_path, drop_capitalized=False):
|
|||
while lines < entry.linenum:
|
||||
buffer.append("")
|
||||
lines += 1
|
||||
buffer.append(clear(po_path, strip_rst(entry.msgstr), drop_capitalized))
|
||||
buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
|
||||
lines += 1
|
||||
return "\n".join(buffer)
|
||||
|
||||
|
@ -256,35 +258,35 @@ def spell_check(
|
|||
|
||||
In debug-only mode, the text that would be passed to Hunspell is printed instead of being spell-checked.
|
||||
"""
|
||||
errors = 0
|
||||
errors = []
|
||||
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
tmpdir = Path(tmpdirname)
|
||||
for po_file in po_files:
|
||||
if debug_only:
|
||||
print(po_to_text(str(po_file), drop_capitalized))
|
||||
continue
|
||||
(tmpdir / po_file.name).write_text(
|
||||
po_to_text(str(po_file), drop_capitalized)
|
||||
for po_file in po_files:
|
||||
if debug_only:
|
||||
print(po_to_text(str(po_file), drop_capitalized))
|
||||
continue
|
||||
text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
|
||||
try:
|
||||
output = subprocess.run(
|
||||
["hunspell", "-d", language, "-l"] + personal_dict_arg,
|
||||
universal_newlines=True,
|
||||
input=text_for_hunspell,
|
||||
stdout=subprocess.PIPE,
|
||||
)
|
||||
try:
|
||||
output = subprocess.check_output(
|
||||
["hunspell", "-d", language]
|
||||
+ personal_dict_arg
|
||||
+ ["-u3", str(tmpdir / po_file.name)],
|
||||
universal_newlines=True,
|
||||
)
|
||||
except subprocess.CalledProcessError:
|
||||
return -1
|
||||
for line in output.split("\n"):
|
||||
match = regex.match(
|
||||
r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$",
|
||||
line,
|
||||
)
|
||||
if match:
|
||||
errors += 1
|
||||
print(po_file, match.group("line"), match.group("error"), sep=":")
|
||||
return errors
|
||||
except subprocess.CalledProcessError:
|
||||
return -1
|
||||
if not output.stdout:
|
||||
continue # No errors :)
|
||||
line_of_words = defaultdict(set)
|
||||
for line, text in enumerate(text_for_hunspell.split("\n"), start=1):
|
||||
for word in text.split():
|
||||
line_of_words[word].add(line)
|
||||
for misspelled_word in set(output.stdout.split("\n")):
|
||||
for line_number in line_of_words[misspelled_word]:
|
||||
errors.append((po_file, line_number, misspelled_word))
|
||||
errors.sort()
|
||||
for error in errors:
|
||||
print(":".join(str(token) for token in error))
|
||||
return len(errors)
|
||||
|
||||
|
||||
def gracefull_handling_of_missing_dicts(language):
|
||||
|
|
|
@ -3,95 +3,70 @@ from pospell import clear, strip_rst
|
|||
|
||||
def test_clear():
|
||||
# We don't remove legitimately capitalized first words:
|
||||
assert clear("test", "Sport is great.") == "Sport is great."
|
||||
assert clear("test", "Sport is great.", drop_capitalized=True) == "Sport is great."
|
||||
assert clear("Sport is great.") == "Sport is great."
|
||||
assert clear("Sport is great.", True) == "Sport is great."
|
||||
|
||||
# Sometimes we can't guess it's a firstname:
|
||||
assert clear("test", "Julien Palard teste.") == "Julien Palard teste."
|
||||
assert (
|
||||
clear("test", "Julien Palard teste.", drop_capitalized=True) == "Julien teste."
|
||||
)
|
||||
assert clear("Julien Palard teste.") == "Julien Palard teste."
|
||||
assert "Palard" not in clear("Julien Palard teste.", True)
|
||||
|
||||
# We remove capitalized words in the middle of a sentence
|
||||
# they are typically names
|
||||
assert clear("test", "Great is Unicode.") == "Great is Unicode."
|
||||
assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."
|
||||
assert clear("Great is Unicode.") == "Great is Unicode."
|
||||
assert "Unicode" not in clear("Great is Unicode.", True)
|
||||
|
||||
# We remove capitalized words even prefixed with l' in French.
|
||||
assert (
|
||||
clear("test", "Bah si, l'Unicode c'est bien.")
|
||||
== "Bah si, l'Unicode c'est bien."
|
||||
)
|
||||
assert (
|
||||
clear("test", "Bah si, l'Unicode c'est bien.", drop_capitalized=True)
|
||||
== "Bah si, c'est bien."
|
||||
)
|
||||
assert clear("Bah si, l'Unicode c'est bien.") == "Bah si, l'Unicode c'est bien."
|
||||
assert "Unicode" not in clear("Bah si, l'Unicode c'est bien.", True)
|
||||
|
||||
# We remove single letters in quotes
|
||||
assert clear("test", "La lettre « é » est seule.") == "La lettre est seule."
|
||||
assert "é" not in clear("La lettre « é » est seule.")
|
||||
|
||||
# We remove soft hyphens
|
||||
assert clear("test", "some\xadthing") == "something"
|
||||
assert clear("some\xadthing") == "something"
|
||||
|
||||
# We drop hours because hunspell whines on them
|
||||
assert clear("test", "Rendez-vous à 10h chez Murex") == "Rendez-vous à chez Murex"
|
||||
assert "10h" not in clear("Rendez-vous à 10h chez Murex")
|
||||
|
||||
# When we removed a dashed name, remove it all
|
||||
assert clear("test", "Marc-André Lemburg a fait") != "Marc- Lemburg a fait"
|
||||
assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
|
||||
assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
|
||||
assert "Lemburg" not in clear("Marc-André Lemburg a fait", True)
|
||||
|
||||
# Even in the middle of a sentence
|
||||
assert (
|
||||
clear("test", "Hier, Marc-André Lemburg a fait")
|
||||
== "Hier, Marc-André Lemburg a fait"
|
||||
)
|
||||
assert clear("Hier, Marc-André Lemburg a fait") == "Hier, Marc-André Lemburg a fait"
|
||||
assert "Marc-André" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||
assert "André" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||
assert "Marc" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||
assert "Lemburg" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||
|
||||
# We remove variables
|
||||
assert clear("test", "Starting {days_since} days ago") == "Starting days ago"
|
||||
assert "days_since" not in clear("Starting {days_since} days ago")
|
||||
|
||||
# Drop PEP 440 versions
|
||||
assert (
|
||||
clear("test", "under python 1.6a1, 1.5.2, and earlier.")
|
||||
== "under python , , and earlier."
|
||||
)
|
||||
assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
|
||||
assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
|
||||
|
||||
# Double space should change nothing
|
||||
assert clear("test", "Test. Aujourd'hui, j'ai faim.") == clear(
|
||||
"test", "Test. Aujourd'hui, j'ai faim."
|
||||
assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
|
||||
"Test. Aujourd'hui, j'ai faim."
|
||||
)
|
||||
|
||||
assert (
|
||||
clear("test", strip_rst(":pep:`305` - Interface des fichiers"))
|
||||
== "Interface des fichiers"
|
||||
)
|
||||
assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
|
||||
|
||||
|
||||
def test_clear_accronyms():
|
||||
for drop_capitalized in True, False:
|
||||
# We always drop acronyms
|
||||
assert (
|
||||
clear("test", "HTTP is great.", drop_capitalized=drop_capitalized)
|
||||
== " is great."
|
||||
)
|
||||
assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
|
||||
|
||||
# Even suffixed with a number
|
||||
assert (
|
||||
clear("test", "POSIX.1 is great.", drop_capitalized=drop_capitalized)
|
||||
== " is great."
|
||||
)
|
||||
assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
|
||||
|
||||
# Correctly drop prefix of acronyms
|
||||
assert (
|
||||
clear("test", "non-HTTP is bad.", drop_capitalized=drop_capitalized)
|
||||
== " is bad."
|
||||
)
|
||||
assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
|
||||
|
||||
# Also skip acronyms in the middle of a sentence
|
||||
assert (
|
||||
clear("test", "Yes HTTP is great.", drop_capitalized=drop_capitalized)
|
||||
== "Yes is great."
|
||||
)
|
||||
assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
|
||||
|
||||
assert (
|
||||
clear("", "Ho. PEPs good.", drop_capitalized=drop_capitalized)
|
||||
== "Ho. good."
|
||||
)
|
||||
assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)
|
||||
|
|
Loading…
Reference in New Issue