parent
9c93c50106
commit
f90ac406af
64
pospell.py
64
pospell.py
|
@ -1,5 +1,6 @@
|
||||||
"""pospell is a spellcheckers for po files containing reStructuedText.
|
"""pospell is a spellcheckers for po files containing reStructuedText.
|
||||||
"""
|
"""
|
||||||
|
from collections import defaultdict
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -23,6 +24,7 @@ __version__ = "1.0.4"
|
||||||
|
|
||||||
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
|
DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
HUNSPELL_VERSION = subprocess.check_output(
|
HUNSPELL_VERSION = subprocess.check_output(
|
||||||
["hunspell", "--version"], universal_newlines=True
|
["hunspell", "--version"], universal_newlines=True
|
||||||
|
@ -123,11 +125,12 @@ def strip_rst(line):
|
||||||
return str(visitor)
|
return str(visitor)
|
||||||
|
|
||||||
|
|
||||||
def clear(po_path, line, drop_capitalized=False):
|
def clear(line, drop_capitalized=False, po_path=""):
|
||||||
"""Clear various other syntaxes we may encounter in a line.
|
"""Clear various other syntaxes we may encounter in a line.
|
||||||
"""
|
"""
|
||||||
# Normalize spaces
|
# Normalize spaces
|
||||||
line = regex.sub(r"\s+", " ", line)
|
line = regex.sub(r"\s+", " ", line).replace("\xad", "")
|
||||||
|
|
||||||
to_drop = {
|
to_drop = {
|
||||||
r'<a href="[^"]*?">',
|
r'<a href="[^"]*?">',
|
||||||
# Strip accronyms
|
# Strip accronyms
|
||||||
|
@ -141,7 +144,6 @@ def clear(po_path, line, drop_capitalized=False):
|
||||||
r"[0-9]+h", # Hours
|
r"[0-9]+h", # Hours
|
||||||
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
|
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
|
||||||
r"« . »", # Single letter examples (typically in Unicode documentation)
|
r"« . »", # Single letter examples (typically in Unicode documentation)
|
||||||
"\xad", # soft hyphen
|
|
||||||
}
|
}
|
||||||
if drop_capitalized:
|
if drop_capitalized:
|
||||||
to_drop.add(
|
to_drop.add(
|
||||||
|
@ -175,7 +177,7 @@ def po_to_text(po_path, drop_capitalized=False):
|
||||||
while lines < entry.linenum:
|
while lines < entry.linenum:
|
||||||
buffer.append("")
|
buffer.append("")
|
||||||
lines += 1
|
lines += 1
|
||||||
buffer.append(clear(po_path, strip_rst(entry.msgstr), drop_capitalized))
|
buffer.append(clear(strip_rst(entry.msgstr), drop_capitalized, po_path=po_path))
|
||||||
lines += 1
|
lines += 1
|
||||||
return "\n".join(buffer)
|
return "\n".join(buffer)
|
||||||
|
|
||||||
|
@ -256,35 +258,35 @@ def spell_check(
|
||||||
|
|
||||||
Debug only will show what's passed to Hunspell instead of passing it.
|
Debug only will show what's passed to Hunspell instead of passing it.
|
||||||
"""
|
"""
|
||||||
errors = 0
|
errors = []
|
||||||
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
|
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
|
||||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
for po_file in po_files:
|
||||||
tmpdir = Path(tmpdirname)
|
if debug_only:
|
||||||
for po_file in po_files:
|
print(po_to_text(str(po_file), drop_capitalized))
|
||||||
if debug_only:
|
continue
|
||||||
print(po_to_text(str(po_file), drop_capitalized))
|
text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
|
||||||
continue
|
try:
|
||||||
(tmpdir / po_file.name).write_text(
|
output = subprocess.run(
|
||||||
po_to_text(str(po_file), drop_capitalized)
|
["hunspell", "-d", language, "-l"] + personal_dict_arg,
|
||||||
|
universal_newlines=True,
|
||||||
|
input=text_for_hunspell,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
)
|
)
|
||||||
try:
|
except subprocess.CalledProcessError:
|
||||||
output = subprocess.check_output(
|
return -1
|
||||||
["hunspell", "-d", language]
|
if not output.stdout:
|
||||||
+ personal_dict_arg
|
continue # No errors :)
|
||||||
+ ["-u3", str(tmpdir / po_file.name)],
|
line_of_words = defaultdict(set)
|
||||||
universal_newlines=True,
|
for line, text in enumerate(text_for_hunspell.split("\n"), start=1):
|
||||||
)
|
for word in text.split():
|
||||||
except subprocess.CalledProcessError:
|
line_of_words[word].add(line)
|
||||||
return -1
|
for misspelled_word in set(output.stdout.split("\n")):
|
||||||
for line in output.split("\n"):
|
for line_number in line_of_words[misspelled_word]:
|
||||||
match = regex.match(
|
errors.append((po_file, line_number, misspelled_word))
|
||||||
r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$",
|
errors.sort()
|
||||||
line,
|
for error in errors:
|
||||||
)
|
print(":".join(str(token) for token in error))
|
||||||
if match:
|
return len(errors)
|
||||||
errors += 1
|
|
||||||
print(po_file, match.group("line"), match.group("error"), sep=":")
|
|
||||||
return errors
|
|
||||||
|
|
||||||
|
|
||||||
def gracefull_handling_of_missing_dicts(language):
|
def gracefull_handling_of_missing_dicts(language):
|
||||||
|
|
|
@ -3,95 +3,70 @@ from pospell import clear, strip_rst
|
||||||
|
|
||||||
def test_clear():
|
def test_clear():
|
||||||
# We don't remove legitimally capitalized first words:
|
# We don't remove legitimally capitalized first words:
|
||||||
assert clear("test", "Sport is great.") == "Sport is great."
|
assert clear("Sport is great.") == "Sport is great."
|
||||||
assert clear("test", "Sport is great.", drop_capitalized=True) == "Sport is great."
|
assert clear("Sport is great.", True) == "Sport is great."
|
||||||
|
|
||||||
# Sometimes we can't guess it's a firstname:
|
# Sometimes we can't guess it's a firstname:
|
||||||
assert clear("test", "Julien Palard teste.") == "Julien Palard teste."
|
assert clear("Julien Palard teste.") == "Julien Palard teste."
|
||||||
assert (
|
assert "Palard" not in clear("Julien Palard teste.", True)
|
||||||
clear("test", "Julien Palard teste.", drop_capitalized=True) == "Julien teste."
|
|
||||||
)
|
|
||||||
|
|
||||||
# We remove capitalized words in the middle of a sentence
|
# We remove capitalized words in the middle of a sentence
|
||||||
# they are typically names
|
# they are typically names
|
||||||
assert clear("test", "Great is Unicode.") == "Great is Unicode."
|
assert clear("Great is Unicode.") == "Great is Unicode."
|
||||||
assert clear("test", "Great is Unicode.", drop_capitalized=True) == "Great is ."
|
assert "Unicode" not in clear("Great is Unicode.", True)
|
||||||
|
|
||||||
# We remove capitalized words even prefixed with l' in french.
|
# We remove capitalized words even prefixed with l' in french.
|
||||||
assert (
|
assert clear("Bah si, l'Unicode c'est bien.") == "Bah si, l'Unicode c'est bien."
|
||||||
clear("test", "Bah si, l'Unicode c'est bien.")
|
assert "Unicode" not in clear("Bah si, l'Unicode c'est bien.", True)
|
||||||
== "Bah si, l'Unicode c'est bien."
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
clear("test", "Bah si, l'Unicode c'est bien.", drop_capitalized=True)
|
|
||||||
== "Bah si, c'est bien."
|
|
||||||
)
|
|
||||||
|
|
||||||
# We remove single letters in quotes
|
# We remove single letters in quotes
|
||||||
assert clear("test", "La lettre « é » est seule.") == "La lettre est seule."
|
assert "é" not in clear("La lettre « é » est seule.")
|
||||||
|
|
||||||
# We remove soft hyphens
|
# We remove soft hyphens
|
||||||
assert clear("test", "some\xadthing") == "something"
|
assert clear("some\xadthing") == "something"
|
||||||
|
|
||||||
# We drop hours because hunspell whines on them
|
# We drop hours because hunspell whines on them
|
||||||
assert clear("test", "Rendez-vous à 10h chez Murex") == "Rendez-vous à chez Murex"
|
assert "10h" not in clear("Rendez-vous à 10h chez Murex")
|
||||||
|
|
||||||
# When we removed a dashed name, remove it all
|
# When we removed a dashed name, remove it all
|
||||||
assert clear("test", "Marc-André Lemburg a fait") != "Marc- Lemburg a fait"
|
assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
|
||||||
|
assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
|
||||||
|
assert "Lemburg" not in clear("Marc-André Lemburg a fait", True)
|
||||||
|
|
||||||
# Even in the middle of a sentence
|
# Even in the middle of a sentence
|
||||||
assert (
|
assert clear("Hier, Marc-André Lemburg a fait") == "Hier, Marc-André Lemburg a fait"
|
||||||
clear("test", "Hier, Marc-André Lemburg a fait")
|
assert "Marc-André" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||||
== "Hier, Marc-André Lemburg a fait"
|
assert "André" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||||
)
|
assert "Marc" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||||
|
assert "Lemburg" not in clear("Hier, Marc-André Lemburg a fait", True)
|
||||||
|
|
||||||
# We remove variables
|
# We remove variables
|
||||||
assert clear("test", "Starting {days_since} days ago") == "Starting days ago"
|
assert "days_since" not in clear("Starting {days_since} days ago")
|
||||||
|
|
||||||
# Drop PEP 440 versions
|
# Drop PEP 440 versions
|
||||||
assert (
|
assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
|
||||||
clear("test", "under python 1.6a1, 1.5.2, and earlier.")
|
assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
|
||||||
== "under python , , and earlier."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Double space should change nothing
|
# Double space should change nothing
|
||||||
assert clear("test", "Test. Aujourd'hui, j'ai faim.") == clear(
|
assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
|
||||||
"test", "Test. Aujourd'hui, j'ai faim."
|
"Test. Aujourd'hui, j'ai faim."
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
|
||||||
clear("test", strip_rst(":pep:`305` - Interface des fichiers"))
|
|
||||||
== "Interface des fichiers"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_clear_accronyms():
|
def test_clear_accronyms():
|
||||||
for drop_capitalized in True, False:
|
for drop_capitalized in True, False:
|
||||||
# We always drop accronyms
|
# We always drop accronyms
|
||||||
assert (
|
assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
|
||||||
clear("test", "HTTP is great.", drop_capitalized=drop_capitalized)
|
|
||||||
== " is great."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Even suffixed with a number
|
# Even suffixed with a number
|
||||||
assert (
|
assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
|
||||||
clear("test", "POSIX.1 is great.", drop_capitalized=drop_capitalized)
|
|
||||||
== " is great."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Correctly drop prefix of accronyms
|
# Correctly drop prefix of accronyms
|
||||||
assert (
|
assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
|
||||||
clear("test", "non-HTTP is bad.", drop_capitalized=drop_capitalized)
|
|
||||||
== " is bad."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Also skip accronyms in the middle of a sentence
|
# Also skip accronyms in the middle of a sentence
|
||||||
assert (
|
assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
|
||||||
clear("test", "Yes HTTP is great.", drop_capitalized=drop_capitalized)
|
|
||||||
== "Yes is great."
|
|
||||||
)
|
|
||||||
|
|
||||||
assert (
|
assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)
|
||||||
clear("", "Ho. PEPs good.", drop_capitalized=drop_capitalized)
|
|
||||||
== "Ho. good."
|
|
||||||
)
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user