Faster implementation (about twice as fast on python-docs-fr).

This commit is contained in:
Julien Palard 2020-10-11 23:00:30 +02:00
parent 60730425d8
commit 5db3a839cb
8 changed files with 107 additions and 107 deletions

View File

@ -2,6 +2,8 @@
""" """
from collections import defaultdict from collections import defaultdict
import io import io
from string import digits
from unicodedata import category
import logging import logging
import subprocess import subprocess
import sys import sys
@ -108,9 +110,27 @@ def strip_rst(line):
line = line[:-2] line = line[:-2]
parser = docutils.parsers.rst.Parser() parser = docutils.parsers.rst.Parser()
components = (docutils.parsers.rst.Parser,) components = (docutils.parsers.rst.Parser,)
settings = docutils.frontend.OptionParser( settings = docutils.frontend.Values(
components=components {
).get_default_values() "report_level": 2,
"halt_level": 4,
"exit_status_level": 5,
"debug": None,
"warning_stream": None,
"error_encoding": "utf-8",
"error_encoding_error_handler": "backslashreplace",
"language_code": "en",
"id_prefix": "",
"auto_id_prefix": "id",
"pep_references": None,
"pep_base_url": "http://www.python.org/dev/peps/",
"pep_file_url_template": "pep-%04d",
"rfc_references": None,
"rfc_base_url": "http://tools.ietf.org/html/",
"tab_width": 8,
"trim_footnote_reference_space": None,
}
)
stderr_stringio = io.StringIO() stderr_stringio = io.StringIO()
with redirect_stderr(stderr_stringio): with redirect_stderr(stderr_stringio):
document = new_document("<rst-doc>", settings=settings) document = new_document("<rst-doc>", settings=settings)
@ -130,15 +150,7 @@ def clear(line, drop_capitalized=False, po_path=""):
to_drop = { to_drop = {
r'<a href="[^"]*?">', r'<a href="[^"]*?">',
# Strip accronyms
r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
r"---?", # -- and --- separators to be ignored
r" - ", # Drop lone dashes (sometimes used in place of -- or ---)
r"-\\ ", # Ignore "MINUS BACKSLASH SPACE" typically used in
# formulas, like '-\ *π*' but *π* gets removed too
r"{[a-z_]*?}", # Sphinx variable r"{[a-z_]*?}", # Sphinx variable
r"'?-?\b([0-9]+\.)*[0-9]+\.[0-9abcrx]+\b'?", # Versions
r"[0-9]+h", # Hours
r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable r"%\([a-z_]+?\)[diouxXeEfFgGcrsa%]", # Sphinx variable
r"« . »", # Single letter examples (typically in Unicode documentation) r"« . »", # Single letter examples (typically in Unicode documentation)
} }
@ -245,10 +257,25 @@ def parse_args():
return args return args
def look_like_a_word(word):
"""Used to filter out non-words like `---` or `-0700` so they don't
get reported. They typically are not errors.
"""
if not word:
return False
if any(digit in word for digit in digits):
return False
if len([c for c in word if category(c) == "Lu"]) > 1:
return False # Probably an accronym, or a name like CPython, macOS, SQLite, ...
if "-" in word:
return False
return True
def spell_check( def spell_check(
po_files, po_files,
personal_dict=None, personal_dict=None,
language="en_EN", language="en_US",
drop_capitalized=False, drop_capitalized=False,
debug_only=False, debug_only=False,
): ):
@ -260,24 +287,27 @@ def spell_check(
""" """
errors = [] errors = []
personal_dict_arg = ["-p", personal_dict] if personal_dict else [] personal_dict_arg = ["-p", personal_dict] if personal_dict else []
texts_for_hunspell = {}
for po_file in po_files: for po_file in po_files:
if debug_only: if debug_only:
print(po_to_text(str(po_file), drop_capitalized)) print(po_to_text(str(po_file), drop_capitalized))
continue continue
text_for_hunspell = po_to_text(str(po_file), drop_capitalized) texts_for_hunspell[po_file] = po_to_text(str(po_file), drop_capitalized)
try: try:
output = subprocess.run( output = subprocess.run(
["hunspell", "-d", language, "-l"] + personal_dict_arg, ["hunspell", "-d", language, "-l"] + personal_dict_arg,
universal_newlines=True, universal_newlines=True,
input=text_for_hunspell, input="\n".join(texts_for_hunspell.values()),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
) )
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
return -1 return -1
if not output.stdout: if not output.stdout:
continue # No errors :) return 0
line_of_words = defaultdict(set) for misspelled_word in {
for misspelled_word in {word for word in output.stdout.split("\n") if word}: word for word in output.stdout.split("\n") if look_like_a_word(word)
}:
for po_file, text_for_hunspell in texts_for_hunspell.items():
for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1): for line_number, line in enumerate(text_for_hunspell.split("\n"), start=1):
if misspelled_word in line: if misspelled_word in line:
errors.append((po_file, line_number, misspelled_word)) errors.append((po_file, line_number, misspelled_word))

View File

@ -0,0 +1,2 @@
msgid "Python doc is translated!"
msgstr "Python doc is tranlsated!"

View File

@ -0,0 +1,17 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR Free Software Foundation, Inc.
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"PO-Revision-Date: 2020-10-11 22:08+0200\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=CHARSET\n"
"Content-Transfer-Encoding: 8bit\n"
msgid "pubb/subb yo"
msgstr "pubb/subb"

View File

@ -0,0 +1,2 @@
msgid "Python doc is translated!"
msgstr "Python doc is translated!"

View File

@ -0,0 +1,14 @@
msgid "HTTP is great."
msgstr "HTTP is great."
msgid "POSIX.1 is great too."
msgstr "POSIX.1 is great too."
msgid "non-HTTP"
msgstr "non-HTTP"
msgid "HTTP"
msgstr "HTTP"
msgid "PEPs"
msgstr "PEPS"

View File

@ -0,0 +1,2 @@
msgid "Rendez-vous à 10h chez Murex"
msgstr "See your at 10h at Murex"

View File

@ -0,0 +1,2 @@
msgid "under python 1.6a1, 1.5.2, and earlier."
msgstr "under python 1.6a1, 1.5.2, and earlier."

View File

@ -1,6 +1,9 @@
import os
from types import SimpleNamespace from types import SimpleNamespace
from pathlib import Path from pathlib import Path
import pytest
from pospell import clear, strip_rst, spell_check from pospell import clear, strip_rst, spell_check
@ -28,9 +31,6 @@ def test_clear():
# We remove soft hyphens # We remove soft hyphens
assert clear("some\xadthing") == "something" assert clear("some\xadthing") == "something"
# We drop hours because hunspell whines on them
assert "10h" not in clear("Rendez-vous à 10h chez Murex")
# When we removed a dashed name, remove it all # When we removed a dashed name, remove it all
assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait" assert clear("Marc-André Lemburg a fait").strip() == "Marc-André Lemburg a fait"
assert "Marc-André" in clear("Marc-André Lemburg a fait", True) assert "Marc-André" in clear("Marc-André Lemburg a fait", True)
@ -46,10 +46,6 @@ def test_clear():
# We remove variables # We remove variables
assert "days_since" not in clear("Starting {days_since} days ago") assert "days_since" not in clear("Starting {days_since} days ago")
# Drop PEP 440 versions
assert "1.6a1" not in clear("under python 1.6a1, 1.5.2, and earlier.")
assert "1.5.2" not in clear("under python 1.6a1, 1.5.2, and earlier.")
# Double space should change nothing # Double space should change nothing
assert clear("Test. Aujourd'hui, j'ai faim.") == clear( assert clear("Test. Aujourd'hui, j'ai faim.") == clear(
"Test. Aujourd'hui, j'ai faim." "Test. Aujourd'hui, j'ai faim."
@ -58,81 +54,16 @@ def test_clear():
assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers")) assert ":pep:`305`" not in clear(strip_rst(":pep:`305` - Interface des fichiers"))
def test_clear_accronyms(): FIXTURE_DIR = Path(__file__).resolve().parent
for drop_capitalized in True, False:
# We always drop accronyms
assert "HTTP" not in clear("HTTP is great.", drop_capitalized)
# Even suffixed with a number
assert "POSIX.1" not in clear("POSIX.1 is great.", drop_capitalized)
# Correctly drop prefix of accronyms
assert "non-HTTP" not in clear("non-HTTP is bad.", drop_capitalized)
# Also skip accronyms in the middle of a sentence
assert "HTTP" not in clear("Yes HTTP is great.", drop_capitalized)
assert "PEPs" not in clear("Ho. PEPs good.", drop_capitalized)
def test_with_an_error(tmp_path, capsys, monkeypatch): @pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_fail").glob("*.po"))
import subprocess def test_expected_to_fail(po_file, capsys):
assert spell_check([po_file]) > 0
tmp_path = Path(tmp_path) assert not capsys.readouterr().err
monkeypatch.setattr(
subprocess,
"run",
lambda *args, **kwargs: SimpleNamespace(stdout="Pyhton\n"),
)
(tmp_path / "test.po").write_text(
"""
msgid "Python FTW!"
msgstr "Gloire à Pyhton !"
"""
)
assert spell_check([tmp_path / "test.po"]) > 0
captured = capsys.readouterr()
assert "Pyhton" in captured.out
assert not captured.err
def test_with_no_error(tmp_path, capsys, monkeypatch): @pytest.mark.parametrize("po_file", (FIXTURE_DIR / "expected_to_success").glob("*.po"))
import subprocess def test_expected_to_success(po_file, capsys):
assert spell_check([po_file]) == 0
tmp_path = Path(tmp_path) assert not capsys.readouterr().err
monkeypatch.setattr(
subprocess,
"run",
lambda *args, **kwargs: SimpleNamespace(stdout=""),
)
(tmp_path / "test.po").write_text(
"""
msgid "Python FTW!"
msgstr "Gloire à Python !"
"""
)
assert spell_check([tmp_path / "test.po"]) == 0
captured = capsys.readouterr()
assert not captured.out
assert not captured.err
def test_issue_19(tmp_path, capsys, monkeypatch):
import subprocess
tmp_path = Path(tmp_path)
monkeypatch.setattr(
subprocess,
"run",
lambda *args, **kwargs: SimpleNamespace(stdout="pubb\nsubb\n"),
)
(tmp_path / "test.po").write_text(
"""
msgid "pubb/subb yo"
msgstr "pubb/subb"
"""
)
assert spell_check([tmp_path / "test.po"]) > 0
captured = capsys.readouterr()
assert "pubb" in captured.out
assert not captured.err