Use hunspell -l instead of hunspell -u3. Fixes #12 (#16)

This commit is contained in:
Julien Palard 2020-07-01 17:35:13 +02:00 committed by GitHub
parent 9c93c50106
commit f90ac406af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 86 deletions

View File

@ -1,5 +1,6 @@
"""pospell is a spellcheckers for po files containing reStructuedText.
"""
from collections import defaultdict
import io
import logging
import subprocess
@ -23,6 +24,7 @@ __version__ = "1.0.4"
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
try:
HUNSPELL_VERSION = subprocess.check_output(
["hunspell", "--version"], universal_newlines=True
@ -123,11 +125,12 @@ def strip_rst(line):
return str(visitor)
def clear(po_path, line, drop_capitalized=False):
def clear(line, drop_capitalized=False, po_path=""):
"""Clear various other syntaxes we may encounter in a line.
"""
# Normalize spaces
line = regex.sub(r"\s+", " ", line)
line = regex.sub(r"\s+", " ", line).replace("\xad", "")
to_drop = {
r'<a href="[^"]*?">',
# Strip acronyms
@ -141,7 +144,6 @@ def clear(po_path, line, drop_capitalized=False):
r"[0-9]+h", # Hours
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
r"« . »", # Single letter examples (typically in Unicode documentation)
"\xad", # soft hyphen
}
if drop_capitalized:
to_drop.add(
@ -175,7 +177,7 @@ def po_to_text(po_path, drop_capitalized=False):
while lines < entry.linenum:
buffer.append("")
lines += 1
buffer.append(clear(po_path, strip_rst(entry.msgstr), drop_capitalized))
buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
lines += 1
return "\n".join(buffer)
@ -256,35 +258,35 @@ def spell_check(
In debug-only mode, the text that would be passed to Hunspell is printed instead of being spell-checked.
"""
errors = 0
errors = []
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdir = Path(tmpdirname)
for po_file in po_files:
if debug_only:
print(po_to_text(str(po_file), drop_capitalized))
continue
(tmpdir / po_file.name).write_text(
po_to_text(str(po_file), drop_capitalized)
for po_file in po_files:
if debug_only:
print(po_to_text(str(po_file), drop_capitalized))
continue
text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
try:
output = subprocess.run(
["hunspell", "-d", language, "-l"] + personal_dict_arg,
universal_newlines=True,
input=text_for_hunspell,
stdout=subprocess.PIPE,
)
try:
output = subprocess.check_output(
["hunspell", "-d", language]
+ personal_dict_arg
+ ["-u3", str(tmpdir / po_file.name)],
universal_newlines=True,
)
except subprocess.CalledProcessError:
return -1
for line in output.split("\n"):
match = regex.match(
r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$",
line,
)
if match:
errors += 1
print(po_file, match.group("line"), match.group("error"), sep=":")
return errors
except subprocess.CalledProcessError:
return -1
if not output.stdout:
continue # No errors :)
line_of_words = defaultdict(set)
for line, text in enumerate(text_for_hunspell.split("\n"), start=1):
for word in text.split():
line_of_words[word].add(line)
for misspelled_word in set(output.stdout.split("\n")):
for line_number in line_of_words[misspelled_word]:
errors.append((po_file, line_number, misspelled_word))
errors.sort()
for error in errors:
print(":".join(str(token) for token in error))
return len(errors)
def gracefull_handling_of_missing_dicts(language):

View File

@ -3,95 +3,70 @@ from pospell import clear, strip_rst
def test_clear():
# We don't remove legitimately capitalized first words:
assert clear("test", "Sport is great.") == "Sport is great."
assert clear("test", "Sport is great.", drop_capitalized=True) == "Sport is great."
assert clear("Sport is great.") == "Sport is great."
assert clear("Sport is great.", True) == "Sport is great."
# Sometimes we can't guess it's a firstname:
assert clear("test", "Julien Palard teste.") == "Julien Palard teste."
assert (
clear("test", "Julien Palard teste.", drop_capitalized=True) == "Julien teste."
)
assert clear("Julien Palard teste.") == "Julien Palard teste."
assert "Palard" not in clear("Julien Palard teste.", True)
# We remove capitalized words in the middle of a sentence
# they are typically names
assert clear("test", "Great is Unicode.") == "Great is Unicode."
assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."
assert clear("Great is Unicode.") == "Great is Unicode."
assert "Unicode" not in clear("Great is Unicode.", True)
# We remove capitalized words even prefixed with l' in french.
assert (
clear("test", "Bah si, l'Unicode c'est bien.")
== "Bah si, l'Unicode c'est bien."
)
assert (
clear("test", "Bah si, l'Unicode c'est bien.", drop_capitalized=True)
== "Bah si, c'est bien."
)
assert clear("Bah si, l'Unicode c'est bien.") == "Bah si, l'Unicode c'est bien."
assert "Unicode" not in clear("Bah si, l'Unicode c'est bien.", True)
# We remove single letters in quotes
assert clear("test", "La lettre « é » est seule.") == "La lettre est seule."
assert "é" not in clear("La lettre « é » est seule.")
# We remove soft hyphens
assert clear("test", "some\xadthing") == "something"
assert clear("some\xadthing") == "something"
# We drop hours because hunspell whines on them
assert clear("test", "Rendez-vous à 10h chez Murex") == "Rendez-vous à chez Murex"
assert "10h" not in clear("Rendez-vous à 10h chez Murex")
# When we remove a dashed name, remove all of it
assert clear("test", "Marc-André Lemburg a fait") != "Marc- Lemburg a fait"
assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
assert "Lemburg" not in clear("Marc-André Lemburg a fait", True)
# Even in the middle of a sentence
assert (
clear("test", "Hier, Marc-André Lemburg a fait")
== "Hier, Marc-André Lemburg a fait"
)
assert clear("Hier, Marc-André Lemburg a fait") == "Hier, Marc-André Lemburg a fait"
assert "Marc-André" not in clear("Hier, Marc-André Lemburg a fait", True)
assert "André" not in clear("Hier, Marc-André Lemburg a fait", True)
assert "Marc" not in clear("Hier, Marc-André Lemburg a fait", True)
assert "Lemburg" not in clear("Hier, Marc-André Lemburg a fait", True)
# We remove variables
assert clear("test", "Starting {days_since} days ago") == "Starting days ago"
assert "days_since" not in clear("Starting {days_since} days ago")
# Drop PEP 440 versions
assert (
clear("test", "under python 1.6a1, 1.5.2, and earlier.")
== "under python , , and earlier."
)
assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
# Double space should change nothing
assert clear("test", "Test. Aujourd'hui, j'ai faim.") == clear(
"test", "Test. Aujourd'hui, j'ai faim."
assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
"Test. Aujourd'hui, j'ai faim."
)
assert (
clear("test", strip_rst(":pep:`305` - Interface des fichiers"))
== "Interface des fichiers"
)
assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
def test_clear_accronyms():
for drop_capitalized in True, False:
# We always drop acronyms
assert (
clear("test", "HTTP is great.", drop_capitalized=drop_capitalized)
== " is great."
)
assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
# Even suffixed with a number
assert (
clear("test", "POSIX.1 is great.", drop_capitalized=drop_capitalized)
== " is great."
)
assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
# Correctly drop prefix of acronyms
assert (
clear("test", "non-HTTP is bad.", drop_capitalized=drop_capitalized)
== " is bad."
)
assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
# Also skip acronyms in the middle of a sentence
assert (
clear("test", "Yes HTTP is great.", drop_capitalized=drop_capitalized)
== "Yes is great."
)
assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
assert (
clear("", "Ho. PEPs good.", drop_capitalized=drop_capitalized)
== "Ho. good."
)
assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)