Use hunspell -l instead of hunspell -u3. Fixes #12 (#16)

This commit is contained in:
Julien Palard 2020-07-01 17:35:13 +02:00 committed by GitHub
parent 9c93c50106
commit f90ac406af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 86 deletions

View File

@ -1,5 +1,6 @@
"""pospell is a spellcheckers for po files containing reStructuedText. """pospell is a spellcheckers for po files containing reStructuedText.
""" """
from collections import defaultdict
import io import io
import logging import logging
import subprocess import subprocess
@ -23,6 +24,7 @@ __version__ = "1.0.4"
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True} DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
try: try:
HUNSPELL_VERSION = subprocess.check_output( HUNSPELL_VERSION = subprocess.check_output(
["hunspell", "--version"], universal_newlines=True ["hunspell", "--version"], universal_newlines=True
@ -123,11 +125,12 @@ def strip_rst(line):
return str(visitor) return str(visitor)
def clear(po_path, line, drop_capitalized=False): def clear(line, drop_capitalized=False, po_path=""):
"""Clear various other syntaxes we may encounter in a line. """Clear various other syntaxes we may encounter in a line.
""" """
# Normalize spaces # Normalize spaces
line = regex.sub(r"\s+", " ", line) line = regex.sub(r"\s+", " ", line).replace("\xad", "")
to_drop = { to_drop = {
r'<a href="[^"]*?">', r'<a href="[^"]*?">',
# Strip acronyms # Strip acronyms
@ -141,7 +144,6 @@ def clear(po_path, line, drop_capitalized=False):
r"[0-9]+h", # Hours r"[0-9]+h", # Hours
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
r"« . »", # Single letter examples (typically in Unicode documentation) r"« . »", # Single letter examples (typically in Unicode documentation)
"\xad", # soft hyphen
} }
if drop_capitalized: if drop_capitalized:
to_drop.add( to_drop.add(
@ -175,7 +177,7 @@ def po_to_text(po_path, drop_capitalized=False):
while lines < entry.linenum: while lines < entry.linenum:
buffer.append("") buffer.append("")
lines += 1 lines += 1
buffer.append(clear(po_path, strip_rst(entry.msgstr), drop_capitalized)) buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
lines += 1 lines += 1
return "\n".join(buffer) return "\n".join(buffer)
@ -256,35 +258,35 @@ def spell_check(
Debug only will show what's passed to Hunspell instead of passing it. Debug only will show what's passed to Hunspell instead of passing it.
""" """
errors = 0 errors = []
personal_dict_arg = ["-p", personal_dict] if personal_dict else [] personal_dict_arg = ["-p", personal_dict] if personal_dict else []
with tempfile.TemporaryDirectory() as tmpdirname: for po_file in po_files:
tmpdir = Path(tmpdirname) if debug_only:
for po_file in po_files: print(po_to_text(str(po_file), drop_capitalized))
if debug_only: continue
print(po_to_text(str(po_file), drop_capitalized)) text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
continue try:
(tmpdir / po_file.name).write_text( output = subprocess.run(
po_to_text(str(po_file), drop_capitalized) ["hunspell", "-d", language, "-l"] + personal_dict_arg,
universal_newlines=True,
input=text_for_hunspell,
stdout=subprocess.PIPE,
) )
try: except subprocess.CalledProcessError:
output = subprocess.check_output( return -1
["hunspell", "-d", language] if not output.stdout:
+ personal_dict_arg continue # No errors :)
+ ["-u3", str(tmpdir / po_file.name)], line_of_words = defaultdict(set)
universal_newlines=True, for line, text in enumerate(text_for_hunspell.split("\n"), start=1):
) for word in text.split():
except subprocess.CalledProcessError: line_of_words[word].add(line)
return -1 for misspelled_word in set(output.stdout.split("\n")):
for line in output.split("\n"): for line_number in line_of_words[misspelled_word]:
match = regex.match( errors.append((po_file, line_number, misspelled_word))
r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$", errors.sort()
line, for error in errors:
) print(":".join(str(token) for token in error))
if match: return len(errors)
errors += 1
print(po_file, match.group("line"), match.group("error"), sep=":")
return errors
def gracefull_handling_of_missing_dicts(language): def gracefull_handling_of_missing_dicts(language):

View File

@ -3,95 +3,70 @@ from pospell import clear, strip_rst
def test_clear(): def test_clear():
# We don't remove legitimately capitalized first words: # We don't remove legitimately capitalized first words:
assert clear("test", "Sport is great.") == "Sport is great." assert clear("Sport is great.") == "Sport is great."
assert clear("test", "Sport is great.", drop_capitalized=True) == "Sport is great." assert clear("Sport is great.", True) == "Sport is great."
# Sometimes we can't guess it's a firstname: # Sometimes we can't guess it's a firstname:
assert clear("test", "Julien Palard teste.") == "Julien Palard teste." assert clear("Julien Palard teste.") == "Julien Palard teste."
assert ( assert "Palard" not in clear("Julien Palard teste.", True)
clear("test", "Julien Palard teste.", drop_capitalized=True) == "Julien teste."
)
# We remove capitalized words in the middle of a sentence # We remove capitalized words in the middle of a sentence
# they are typically names # they are typically names
assert clear("test", "Great is Unicode.") == "Great is Unicode." assert clear("Great is Unicode.") == "Great is Unicode."
assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ." assert "Unicode" not in clear("Great is Unicode.", True)
# We remove capitalized words even prefixed with l' in french. # We remove capitalized words even prefixed with l' in french.
assert ( assert clear("Bah si, l'Unicode c'est bien.") == "Bah si, l'Unicode c'est bien."
clear("test", "Bah si, l'Unicode c'est bien.") assert "Unicode" not in clear("Bah si, l'Unicode c'est bien.", True)
== "Bah si, l'Unicode c'est bien."
)
assert (
clear("test", "Bah si, l'Unicode c'est bien.", drop_capitalized=True)
== "Bah si, c'est bien."
)
# We remove single letters in quotes # We remove single letters in quotes
assert clear("test", "La lettre « é » est seule.") == "La lettre est seule." assert "é" not in clear("La lettre « é » est seule.")
# We remove soft hyphens # We remove soft hyphens
assert clear("test", "some\xadthing") == "something" assert clear("some\xadthing") == "something"
# We drop hours because hunspell whines on them # We drop hours because hunspell whines on them
assert clear("test", "Rendez-vous à 10h chez Murex") == "Rendez-vous à chez Murex" assert "10h" not in clear("Rendez-vous à 10h chez Murex")
# When we removed a dashed name, remove it all # When we removed a dashed name, remove it all
assert clear("test", "Marc-André Lemburg a fait") != "Marc- Lemburg a fait" assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
assert "Lemburg" not in clear("Marc-André Lemburg a fait", True)
# Even in the middle of a sentence # Even in the middle of a sentence
assert ( assert clear("Hier, Marc-André Lemburg a fait") == "Hier, Marc-André Lemburg a fait"
clear("test", "Hier, Marc-André Lemburg a fait") assert "Marc-André" not in clear("Hier, Marc-André Lemburg a fait", True)
== "Hier, Marc-André Lemburg a fait" assert "André" not in clear("Hier, Marc-André Lemburg a fait", True)
) assert "Marc" not in clear("Hier, Marc-André Lemburg a fait", True)
assert "Lemburg" not in clear("Hier, Marc-André Lemburg a fait", True)
# We remove variables # We remove variables
assert clear("test", "Starting {days_since} days ago") == "Starting days ago" assert "days_since" not in clear("Starting {days_since} days ago")
# Drop PEP 440 versions # Drop PEP 440 versions
assert ( assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
clear("test", "under python 1.6a1, 1.5.2, and earlier.") assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
== "under python , , and earlier."
)
# Double space should change nothing # Double space should change nothing
assert clear("test", "Test. Aujourd'hui, j'ai faim.") == clear( assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
"test", "Test. Aujourd'hui, j'ai faim." "Test. Aujourd'hui, j'ai faim."
) )
assert ( assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
clear("test", strip_rst(":pep:`305` - Interface des fichiers"))
== "Interface des fichiers"
)
def test_clear_accronyms(): def test_clear_accronyms():
for drop_capitalized in True, False: for drop_capitalized in True, False:
# We always drop acronyms # We always drop acronyms
assert ( assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
clear("test", "HTTP is great.", drop_capitalized=drop_capitalized)
== " is great."
)
# Even suffixed with a number # Even suffixed with a number
assert ( assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
clear("test", "POSIX.1 is great.", drop_capitalized=drop_capitalized)
== " is great."
)
# Correctly drop prefix of acronyms # Correctly drop prefix of acronyms
assert ( assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
clear("test", "non-HTTP is bad.", drop_capitalized=drop_capitalized)
== " is bad."
)
# Also skip acronyms in the middle of a sentence # Also skip acronyms in the middle of a sentence
assert ( assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
clear("test", "Yes HTTP is great.", drop_capitalized=drop_capitalized)
== "Yes is great."
)
assert ( assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)
clear("", "Ho. PEPs good.", drop_capitalized=drop_capitalized)
== "Ho. good."
)