Faster implementation (about twice as fast on python-docs-fr).
This commit is contained in:
parent
60730425d8
commit
5db3a839cb
82
pospell.py
82
pospell.py
|
@ -2,6 +2,8 @@
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import io
|
import io
|
||||||
|
from string import digits
|
||||||
|
from unicodedata import category
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
@ -108,9 +110,27 @@ def strip_rst(line):
|
||||||
line = line[:-2]
|
line = line[:-2]
|
||||||
parser = docutils.parsers.rst.Parser()
|
parser = docutils.parsers.rst.Parser()
|
||||||
components = (docutils.parsers.rst.Parser,)
|
components = (docutils.parsers.rst.Parser,)
|
||||||
settings = docutils.frontend.OptionParser(
|
settings = docutils.frontend.Values(
|
||||||
components=components
|
{
|
||||||
).get_default_values()
|
"report_level": 2,
|
||||||
|
"halt_level": 4,
|
||||||
|
"exit_status_level": 5,
|
||||||
|
"debug": None,
|
||||||
|
"warning_stream": None,
|
||||||
|
"error_encoding": "utf-8",
|
||||||
|
"error_encoding_error_handler": "backslashreplace",
|
||||||
|
"language_code": "en",
|
||||||
|
"id_prefix": "",
|
||||||
|
"auto_id_prefix": "id",
|
||||||
|
"pep_references": None,
|
||||||
|
"pep_base_url": "http://www.python.org/dev/peps/",
|
||||||
|
"pep_file_url_template": "pep-%04d",
|
||||||
|
"rfc_references": None,
|
||||||
|
"rfc_base_url": "http://tools.ietf.org/html/",
|
||||||
|
"tab_width": 8,
|
||||||
|
"trim_footnote_reference_space": None,
|
||||||
|
}
|
||||||
|
)
|
||||||
stderr_stringio = io.StringIO()
|
stderr_stringio = io.StringIO()
|
||||||
with redirect_stderr(stderr_stringio):
|
with redirect_stderr(stderr_stringio):
|
||||||
document = new_document("<rst-doc>", settings=settings)
|
document = new_document("<rst-doc>", settings=settings)
|
||||||
|
@ -130,15 +150,7 @@ def clear(line, drop_capitalized=False, po_path=""):
|
||||||
|
|
||||||
to_drop = {
|
to_drop = {
|
||||||
r'<a href="[^"]*?">',
|
r'<a href="[^"]*?">',
|
||||||
# Strip accronyms
|
|
||||||
r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
|
|
||||||
r"---?", # -- and --- separators to be ignored
|
|
||||||
r" - ", # Drop lone dashes (sometimes used in place of -- or ---)
|
|
||||||
r"-\\ ", # Ignore "MINUS BACKSLASH SPACE" typically used in
|
|
||||||
# formulas, like '-\ *π*' but *π* gets removed too
|
|
||||||
r"{[a-z_]*?}", # Sphinx variable
|
r"{[a-z_]*?}", # Sphinx variable
|
||||||
r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?", # Versions
|
|
||||||
r"[0-9]+h", # Hours
|
|
||||||
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
|
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
|
||||||
r"« . »", # Single letter examples (typically in Unicode documentation)
|
r"« . »", # Single letter examples (typically in Unicode documentation)
|
||||||
}
|
}
|
||||||
|
@ -245,10 +257,25 @@ def parse_args():
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
def look_like_a_word(word):
    """Tell whether a token reported by the spell checker looks like a real word.

    Used to filter out non-words like `---` or `-0700` so they don't get
    reported. They typically are not errors.

    A token is rejected when it:

    - is empty,
    - contains an ASCII digit (versions, hours, offsets, ...),
    - contains more than one uppercase letter (Unicode category "Lu"),
    - contains a dash.

    Returns True when the token passes every check, False otherwise.
    """
    if not word:
        return False
    if any(digit in word for digit in digits):
        return False
    # Count lazily instead of building a throwaway list just to take its length.
    uppercase_count = sum(1 for char in word if category(char) == "Lu")
    if uppercase_count > 1:
        # Probably an acronym, or a name like CPython, macOS, SQLite, ...
        return False
    if "-" in word:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def spell_check(
|
def spell_check(
|
||||||
po_files,
|
po_files,
|
||||||
personal_dict=None,
|
personal_dict=None,
|
||||||
language="en_EN",
|
language="en_US",
|
||||||
drop_capitalized=False,
|
drop_capitalized=False,
|
||||||
debug_only=False,
|
debug_only=False,
|
||||||
):
|
):
|
||||||
|
@ -260,24 +287,27 @@ def spell_check(
|
||||||
"""
|
"""
|
||||||
errors = []
|
errors = []
|
||||||
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
|
personal_dict_arg = ["-p", personal_dict] if personal_dict else []
|
||||||
|
texts_for_hunspell = {}
|
||||||
for po_file in po_files:
|
for po_file in po_files:
|
||||||
if debug_only:
|
if debug_only:
|
||||||
print(po_to_text(str(po_file), drop_capitalized))
|
print(po_to_text(str(po_file), drop_capitalized))
|
||||||
continue
|
continue
|
||||||
text_for_hunspell = po_to_text(str(po_file), drop_capitalized)
|
texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized)
|
||||||
try:
|
try:
|
||||||
output = subprocess.run(
|
output = subprocess.run(
|
||||||
["hunspell", "-d", language, "-l"] + personal_dict_arg,
|
["hunspell", "-d", language, "-l"] + personal_dict_arg,
|
||||||
universal_newlines=True,
|
universal_newlines=True,
|
||||||
input=text_for_hunspell,
|
input="\n".join(texts_for_hunspell.values()),
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
)
|
)
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
return -1
|
return -1
|
||||||
if not output.stdout:
|
if not output.stdout:
|
||||||
continue # No errors :)
|
return 0
|
||||||
line_of_words = defaultdict(set)
|
for misspelled_word in {
|
||||||
for misspelled_word in {word for word in output.stdout.split("\n") if word}:
|
word for word in output.stdout.split("\n") if look_like_a_word(word)
|
||||||
|
}:
|
||||||
|
for po_file, text_for_hunspell in texts_for_hunspell.items():
|
||||||
for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1):
|
for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1):
|
||||||
if misspelled_word in line:
|
if misspelled_word in line:
|
||||||
errors.append((po_file, line_number, misspelled_word))
|
errors.append((po_file, line_number, misspelled_word))
|
||||||
|
|
2
tests/expected_to_fail/1.po
Normal file
2
tests/expected_to_fail/1.po
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
msgid "Python doc is translated!"
|
||||||
|
msgstr "Python doc is tranlsated!"
|
17
tests/expected_to_fail/issue19.po
Normal file
17
tests/expected_to_fail/issue19.po
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) YEAR Free Software Foundation, Inc.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: PACKAGE VERSION\n"
|
||||||
|
"PO-Revision-Date: 2020-10-11 22:08+0200\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: LANGUAGE <LL@li.org>\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=CHARSET\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
|
||||||
|
msgid "pubb/subb yo"
|
||||||
|
msgstr "pubb/subb"
|
2
tests/expected_to_success/1.po
Normal file
2
tests/expected_to_success/1.po
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
msgid "Python doc is translated!"
|
||||||
|
msgstr "Python doc is translated!"
|
14
tests/expected_to_success/accronyms.po
Normal file
14
tests/expected_to_success/accronyms.po
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
msgid "HTTP is great."
|
||||||
|
msgstr "HTTP is great."
|
||||||
|
|
||||||
|
msgid "POSIX.1 is great too."
|
||||||
|
msgstr "POSIX.1 is great too."
|
||||||
|
|
||||||
|
msgid "non-HTTP"
|
||||||
|
msgstr "non-HTTP"
|
||||||
|
|
||||||
|
msgid "HTTP"
|
||||||
|
msgstr "HTTP"
|
||||||
|
|
||||||
|
msgid "PEPs"
|
||||||
|
msgstr "PEPS"
|
2
tests/expected_to_success/hour.po
Normal file
2
tests/expected_to_success/hour.po
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
msgid "Rendez-vous à 10h chez Murex"
|
||||||
|
msgstr "See your at 10h at Murex"
|
2
tests/expected_to_success/versions.po
Normal file
2
tests/expected_to_success/versions.po
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
msgid "under python 1.6a1, 1.5.2, and earlier."
|
||||||
|
msgstr "under python 1.6a1, 1.5.2, and earlier."
|
|
@ -1,6 +1,9 @@
|
||||||
|
import os
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from pospell import clear, strip_rst, spell_check
|
from pospell import clear, strip_rst, spell_check
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,9 +31,6 @@ def test_clear():
|
||||||
# We remove soft hyphens
|
# We remove soft hyphens
|
||||||
assert clear("some\xadthing") == "something"
|
assert clear("some\xadthing") == "something"
|
||||||
|
|
||||||
# We drop hours because hunspell whines on them
|
|
||||||
assert "10h" not in clear("Rendez-vous à 10h chez Murex")
|
|
||||||
|
|
||||||
# When we removed a dashed name, remove it all
|
# When we removed a dashed name, remove it all
|
||||||
assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
|
assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
|
||||||
assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
|
assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
|
||||||
|
@ -46,10 +46,6 @@ def test_clear():
|
||||||
# We remove variables
|
# We remove variables
|
||||||
assert "days_since" not in clear("Starting {days_since} days ago")
|
assert "days_since" not in clear("Starting {days_since} days ago")
|
||||||
|
|
||||||
# Drop PEP 440 versions
|
|
||||||
assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
|
|
||||||
assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
|
|
||||||
|
|
||||||
# Double space should change nothing
|
# Double space should change nothing
|
||||||
assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
|
assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
|
||||||
"Test. Aujourd'hui, j'ai faim."
|
"Test. Aujourd'hui, j'ai faim."
|
||||||
|
@ -58,81 +54,16 @@ def test_clear():
|
||||||
assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
|
assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
|
||||||
|
|
||||||
|
|
||||||
def test_clear_accronyms():
|
FIXTURE_DIR = Path(__file__).resolve().parent
|
||||||
for drop_capitalized in True, False:
|
|
||||||
# We always drop accronyms
|
|
||||||
assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
|
|
||||||
|
|
||||||
# Even suffixed with a number
|
|
||||||
assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
|
|
||||||
|
|
||||||
# Correctly drop prefix of accronyms
|
|
||||||
assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
|
|
||||||
|
|
||||||
# Also skip accronyms in the middle of a sentence
|
|
||||||
assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
|
|
||||||
|
|
||||||
assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)
|
|
||||||
|
|
||||||
|
|
||||||
def test_with_an_error(tmp_path, capsys, monkeypatch):
|
@pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_fail").glob("*.po"))
|
||||||
import subprocess
|
def test_expected_to_fail(po_file, capsys):
|
||||||
|
assert spell_check([po_file]) > 0
|
||||||
tmp_path = Path(tmp_path)
|
assert not capsys.readouterr().err
|
||||||
monkeypatch.setattr(
|
|
||||||
subprocess,
|
|
||||||
"run",
|
|
||||||
lambda *args, **kwargs: SimpleNamespace(stdout="Pyhton\n"),
|
|
||||||
)
|
|
||||||
(tmp_path / "test.po").write_text(
|
|
||||||
"""
|
|
||||||
msgid "Python FTW!"
|
|
||||||
msgstr "Gloire à Pyhton !"
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
assert spell_check([tmp_path / "test.po"]) > 0
|
|
||||||
captured = capsys.readouterr()
|
|
||||||
assert "Pyhton" in captured.out
|
|
||||||
assert not captured.err
|
|
||||||
|
|
||||||
|
|
||||||
def test_with_no_error(tmp_path, capsys, monkeypatch):
|
@pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_success").glob("*.po"))
|
||||||
import subprocess
|
def test_expected_to_success(po_file, capsys):
|
||||||
|
assert spell_check([po_file]) == 0
|
||||||
tmp_path = Path(tmp_path)
|
assert not capsys.readouterr().err
|
||||||
monkeypatch.setattr(
|
|
||||||
subprocess,
|
|
||||||
"run",
|
|
||||||
lambda *args, **kwargs: SimpleNamespace(stdout=""),
|
|
||||||
)
|
|
||||||
(tmp_path / "test.po").write_text(
|
|
||||||
"""
|
|
||||||
msgid "Python FTW!"
|
|
||||||
msgstr "Gloire à Python !"
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
assert spell_check([tmp_path / "test.po"]) == 0
|
|
||||||
captured = capsys.readouterr()
|
|
||||||
assert not captured.out
|
|
||||||
assert not captured.err
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue_19(tmp_path, capsys, monkeypatch):
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
tmp_path = Path(tmp_path)
|
|
||||||
monkeypatch.setattr(
|
|
||||||
subprocess,
|
|
||||||
"run",
|
|
||||||
lambda *args, **kwargs: SimpleNamespace(stdout="pubb\nsubb\n"),
|
|
||||||
)
|
|
||||||
(tmp_path / "test.po").write_text(
|
|
||||||
"""
|
|
||||||
msgid "pubb/subb yo"
|
|
||||||
msgstr "pubb/subb"
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
assert spell_check([tmp_path / "test.po"]) > 0
|
|
||||||
captured = capsys.readouterr()
|
|
||||||
assert "pubb" in captured.out
|
|
||||||
assert not captured.err
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user