pospell/pospell.py

"""pospell is a spellcheckers for po files containing reStructuedText.
"""
import io
import re
import subprocess
import tempfile
from contextlib import redirect_stderr, redirect_stdout
from itertools import chain
from pathlib import Path
from types import SimpleNamespace

import polib

import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
from docutils.parsers.rst import roles
from docutils.utils import new_document


class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Placeholder inline text node registered for otherwise unknown roles."""


def monkey_patch_role(role):
    """Wrap a role lookup function so that unknown roles become generic ones.

    ``role`` is called with ``(role_name, language_module, lineno, reporter)``
    and must return a ``(role_function, messages)`` pair.  The returned
    wrapper has the same signature: when the lookup yields ``None`` as role
    function, the role name is registered as a generic role backed by
    ``DummyNodeClass`` and the lookup is performed a second time.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        lookup_args = (role_name, language_module, lineno, reporter)
        handler, messages = role(*lookup_args)
        if handler is not None:
            return handler, messages
        # Unknown role: register a generic stand-in, then look it up again.
        roles.register_generic_role(role_name, DummyNodeClass)
        handler, messages = role(*lookup_args)
        return handler, messages

    return role_or_generic


# Replace docutils' role lookup with the wrapped version, so that a role name
# the original lookup does not resolve is registered as a generic role instead
# of coming back as None (presumably to tolerate Sphinx-specific roles --
# TODO confirm).
roles.role = monkey_patch_role(roles.role)


class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Node visitor collecting the raw source of every Text node it visits.

    ``str(visitor)`` gives the collected fragments joined by single spaces.
    """

    def __init__(self, document):
        # Fragments gathered by visit_Text, in visiting order.
        self.output = []
        super().__init__(document)

    def unknown_visit(self, node):
        """Do nothing for node types without a dedicated ``visit_*`` method."""

    def visit_Text(self, node):
        """Record the raw source of a Text node."""
        fragment = node.rawsource
        self.output.append(fragment)

    def __str__(self):
        separator = " "
        return separator.join(self.output)


def strip_rst(line):
    """Return the text of *line* with its reStructuredText markup removed.

    The line is parsed with docutils, then the raw source of every text node
    of the resulting document is joined with single spaces.  Whatever is
    written to stderr during parsing is captured and printed on stdout,
    followed by the text that was being parsed.
    """
    # A trailing "::" announces a literal block; with nothing after it the
    # parser would report "Literal block expected", so it is dropped.
    text = line[:-2] if line.endswith("::") else line
    rst_parser = docutils.parsers.rst.Parser()
    option_parser = docutils.frontend.OptionParser(
        components=(docutils.parsers.rst.Parser,)
    )
    defaults = option_parser.get_default_values()
    captured = io.StringIO()
    with redirect_stderr(captured):
        tree = docutils.utils.new_document("<rst-doc>", settings=defaults)
        rst_parser.parse(text, tree)
    complaints = captured.getvalue()
    if complaints:
        print(complaints.strip(), "while parsing:", text)
    collector = NodeToTextVisitor(tree)
    tree.walk(collector)
    return str(collector)


def clear(line):
    """Remove leftover syntaxes that should not reach the spellchecker.

    Deletes HTML ``<a href="...">`` opening tags, capitalized words and
    acronyms, and both Sphinx variable forms (``{name}`` and ``%(name)s``),
    then returns what remains of *line*.
    """
    noise = re.compile(
        r"""
    <a\ href="[^"]*?">             |  # Strip HTML links
    \b[A-Z][a-zA-Z-]{2,}[a-zA-Z.-]*\b |  # Strip capitalized words and accronyms
    {[a-z]*?}                         |  # Sphinx variable
    %\([a-z_]+?\)s                       # Sphinx variable
    """,
        re.VERBOSE,
    )
    return noise.sub("", line)


def po_to_text(po_path):
    """Convert a po file to plain text suitable for a line-based spellchecker.

    Only the msgstr of each entry is kept, after removing reStructuredText
    markup (``strip_rst``) and other syntaxes (``clear``).  Entries whose
    msgstr is identical to their msgid are skipped.  Empty lines are inserted
    so that each kept text lands right after line ``entry.linenum`` of the
    output, which keeps the line numbers reported on the text meaningful for
    the po file.

    :param po_path: path of the po file, passed as is to ``polib.pofile``.
    :return: the whole text as a single string, one line per kept entry.
    """
    buffer = []
    for entry in polib.pofile(po_path):
        if entry.msgid == entry.msgstr:
            continue
        # Pad with empty lines up to the line number of the entry (a
        # non-positive count simply adds nothing).
        buffer.extend([""] * (entry.linenum - len(buffer)))
        text = clear(strip_rst(entry.msgstr))
        # A newline inside the text would take extra lines in the joined
        # output without being counted, shifting every following entry away
        # from its line number; keep each entry on exactly one line.
        buffer.append(text.replace("\n", " "))
    return "\n".join(buffer)


def main():
    """Module entry point.

    Parse the command line, convert each requested po file to plain text in a
    temporary directory, run hunspell on it and print one ``path:line:word``
    line per reported misspelling.  With ``--debug`` the converted text is
    printed instead and hunspell is not run.

    Terminates the process with status 0 when no misspelling was reported,
    -1 otherwise.  ``subprocess.CalledProcessError`` propagates if hunspell
    exits with a non-zero status.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Check spelling in po files containing restructuredText."
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="fr",
        help="Language to check, you'll have to install the corresponding "
        "hunspell dictionary, on Debian see apt list 'hunspell-*'.",
    )
    parser.add_argument(
        "--glob",
        type=str,
        help="Provide a glob pattern, to be interpreted by pospell, to find po files, "
        "like --glob '**/*.po'.",
    )
    parser.add_argument(
        "po_file",
        nargs="*",
        type=Path,
        help="Files to check, can optionally be mixed with --glob, or not, "
        "use the one that fit your needs.",
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("-p", "--personal-dict", type=str)
    args = parser.parse_args()
    personal_dict = ["-p", args.personal_dict] if args.personal_dict else []
    # Shape of the hunspell -u3 output lines we report; compiled once instead
    # of once per output line.
    report_line = re.compile(
        r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$"
    )
    errors = 0
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        globbed = Path(".").glob(args.glob) if args.glob else []
        for po_file in chain(globbed, args.po_file):
            if args.debug:
                print(po_to_text(str(po_file)))
                continue
            # NOTE(review): only the file name is kept for the temporary copy,
            # so the reported path is the basename and same-named files from
            # different directories cannot be told apart in the output.
            text_file = tmpdir / po_file.name
            text_file.write_text(po_to_text(str(po_file)))
            output = subprocess.check_output(
                ["hunspell", "-d", args.language]
                + personal_dict
                + ["-u3", str(text_file)],
                universal_newlines=True,
            )
            for line in output.split("\n"):
                match = report_line.match(line)
                if match:
                    errors += 1
                    print(
                        # Hide the temporary directory from the reported path.
                        match.group("path").replace(str(tmpdir), "").lstrip("/"),
                        match.group("line"),
                        match.group("error"),
                        sep=":",
                    )
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # site module and is missing under ``python -S`` or in frozen programs.
    sys.exit(0 if errors == 0 else -1)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
Module docstring. 2018-07-27 13:49:58 +00:00			`"""pospell is a spellcheckers for po files containing reStructuedText.`
			`"""`
Use docutils to parse rst. 2018-07-27 19:57:44 +00:00			`import io`
isort 2018-07-27 09:38:17 +00:00			`import re`
Initial commit 2018-07-23 15:37:50 +00:00			`import subprocess`
isort 2018-07-27 09:38:17 +00:00			`import tempfile`
Use docutils to parse rst. 2018-07-27 19:57:44 +00:00			`from contextlib import redirect_stderr, redirect_stdout`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`from itertools import chain`
Initial commit 2018-07-23 15:37:50 +00:00			`from pathlib import Path`
Use docutils to parse rst. 2018-07-27 19:57:44 +00:00			`from types import SimpleNamespace`
isort 2018-07-27 09:38:17 +00:00
Initial commit 2018-07-23 15:37:50 +00:00			`import polib`

Use docutils to parse rst. 2018-07-27 19:57:44 +00:00			`import docutils.frontend`
			`import docutils.nodes`
			`import docutils.parsers.rst`
			`from docutils.parsers.rst import roles`
			`from docutils.utils import new_document`


			`class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):`
			`pass`


			`def monkey_patch_role(role):`
			`def role_or_generic(role_name, language_module, lineno, reporter):`
			`base_role, message = role(role_name, language_module, lineno, reporter)`
			`if base_role is None:`
			`roles.register_generic_role(role_name, DummyNodeClass)`
			`base_role, message = role(role_name, language_module, lineno, reporter)`
			`return base_role, message`

			`return role_or_generic`


			`roles.role = monkey_patch_role(roles.role)`


			`class NodeToTextVisitor(docutils.nodes.NodeVisitor):`
			`def __init__(self, document):`
			`self.output = []`
			`super().__init__(document)`

			`def unknown_visit(self, node):`
			`pass`
			`# self.output.append(node.__class__.__name__ + ": " + node.rawsource)`

			`def visit_Text(self, node):`
			`self.output.append(node.rawsource)`

			`def __str__(self):`
			`return " ".join(self.output)`

Initial commit 2018-07-23 15:37:50 +00:00
			`def strip_rst(line):`
Use docutils to parse rst. 2018-07-27 19:57:44 +00:00			`if line.endswith("::"):`
			`# Drop :: at the end, it would cause Literal block expected`
			`line = line[:-2]`
			`parser = docutils.parsers.rst.Parser()`
			`components = (docutils.parsers.rst.Parser,)`
			`settings = docutils.frontend.OptionParser(`
			`components=components`
			`).get_default_values()`
			`stderr_stringio = io.StringIO()`
			`with redirect_stderr(stderr_stringio):`
			`document = docutils.utils.new_document("<rst-doc>", settings=settings)`
			`parser.parse(line, document)`
			`stderr = stderr_stringio.getvalue()`
			`if stderr:`
			`print(stderr.strip(), "while parsing:", line)`
			`visitor = NodeToTextVisitor(document)`
			`document.walk(visitor)`
			`return str(visitor)`
Initial commit 2018-07-23 15:37:50 +00:00

Strip sphinx variables and a href links. 2018-07-27 09:01:09 +00:00			`def clear(line):`
			`"""Clear various other syntaxes we may encounter in a line.`
			`"""`
Use docutils to parse rst. 2018-07-27 19:57:44 +00:00			`return re.sub(`
			`r"""`
			`<a\ href="[^"]*?"> \| # Strip HTML links`
			`\b[A-Z][a-zA-Z-]{2,}[a-zA-Z.-]*\b \| # Strip capitalized words and accronyms`
			`{[a-z]*?} \| # Sphinx variable`
			`%\([a-z_]+?\)s # Sphinx variable`
			`""",`
			`r"",`
			`line,`
			`flags=re.VERBOSE,`
			`)`
Strip sphinx variables and a href links. 2018-07-27 09:01:09 +00:00

Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`def po_to_text(po_path):`
			`"""Converts a po file to a text file, by stripping the msgids and all`
			`po syntax, but by keeping the kept lines at their same position /`
			`line number.`
			`"""`
Initial commit 2018-07-23 15:37:50 +00:00			`buffer = []`
Report line numbers. 2018-07-23 17:24:10 +00:00			`lines = 0`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`entries = polib.pofile(po_path)`
Initial commit 2018-07-23 15:37:50 +00:00			`for entry in entries:`
Don't spellcheck untranslated messages, they're typically in english (licence text, project names, functions prototypes, ...) 2018-07-27 08:03:21 +00:00			`if entry.msgid == entry.msgstr:`
			`continue`
Report line numbers. 2018-07-23 17:24:10 +00:00			`while lines < entry.linenum:`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`buffer.append("")`
Report line numbers. 2018-07-23 17:24:10 +00:00			`lines += 1`
Strip sphinx variables and a href links. 2018-07-27 09:01:09 +00:00			`buffer.append(clear(strip_rst(entry.msgstr)))`
Report line numbers. 2018-07-23 17:24:10 +00:00			`lines += 1`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`return "\n".join(buffer)`
Initial commit 2018-07-23 15:37:50 +00:00

			`def main():`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`"""Module entry point.`
			`"""`
Initial commit 2018-07-23 15:37:50 +00:00			`import argparse`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00
Initial commit 2018-07-23 15:37:50 +00:00			`parser = argparse.ArgumentParser(`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`description="Check spelling in po files containing restructuredText."`
			`)`
			`parser.add_argument(`
			`"-l",`
			`"--language",`
			`type=str,`
			`default="fr",`
			`help="Language to check, you'll have to install the corresponding "`
			`"hunspell dictionary, on Debian see apt list 'hunspell-*'.",`
			`)`
			`parser.add_argument(`
			`"--glob",`
			`type=str,`
			`help="Provide a glob pattern, to be interpreted by pospell, to find po files, "`
			`"like --glob '*/.po'.",`
			`)`
			`parser.add_argument(`
			`"po_file",`
			`nargs="*",`
FIX: Don't scan recursively by default, and use Path instead of strings. 2018-07-27 14:21:28 +00:00			`type=Path,`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`help="Files to check, can optionally be mixed with --glob, or not, "`
			`"use the one that fit your needs.",`
			`)`
			`parser.add_argument("--debug", action="store_true")`
			`parser.add_argument("-p", "--personal-dict", type=str)`
Initial commit 2018-07-23 15:37:50 +00:00			`args = parser.parse_args()`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`personal_dict = ["-p", args.personal_dict] if args.personal_dict else []`
Exit -1 on error. 2018-07-23 22:28:13 +00:00			`errors = 0`
Initial commit 2018-07-23 15:37:50 +00:00			`with tempfile.TemporaryDirectory() as tmpdirname:`
			`tmpdir = Path(tmpdirname)`
FIX: Don't scan recursively by default, and use Path instead of strings. 2018-07-27 14:21:28 +00:00			`for po_file in chain(`
			`Path(".").glob(args.glob) if args.glob else [], args.po_file`
			`):`
Enhance exclusions thanks to a new debug mode. 2018-07-23 22:00:52 +00:00			`if args.debug:`
			`print(po_to_text(str(po_file)))`
			`continue`
Initial commit 2018-07-23 15:37:50 +00:00			`(tmpdir / po_file.name).write_text(po_to_text(str(po_file)))`
			`output = subprocess.check_output(`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`["hunspell", "-d", args.language]`
			`+ personal_dict`
			`+ ["-u3", str(tmpdir / po_file.name)],`
			`universal_newlines=True,`
			`)`
			`for line in output.split("\n"):`
			`match = re.match(`
			`r"(?P<path>.):(?P<line>[0-9]+): Locate: (?P<error>.) \\| Try: .*$",`
			`line,`
			`)`
Report line numbers. 2018-07-23 17:24:10 +00:00			`if match:`
Exit -1 on error. 2018-07-23 22:28:13 +00:00			`errors += 1`
Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`print(`
			`match.group("path").replace(str(tmpdir), "").lstrip("/"),`
			`match.group("line"),`
			`match.group("error"),`
			`sep=":",`
			`)`
Exit -1 on error. 2018-07-23 22:28:13 +00:00			`exit(0 if errors == 0 else -1)`
Initial commit 2018-07-23 15:37:50 +00:00

Allow for multiple files to be passed positionally. 2018-07-27 13:54:10 +00:00			`if __name__ == "__main__":`
Initial commit 2018-07-23 15:37:50 +00:00			`main()`