WIP: Try without docutils.

2023-07-21 12:32:43 +02:00 · 2023-07-21 12:32:43 +02:00 · 8a5004128a
commit 8a5004128a
parent 8b753bde26
1 changed file with 37 additions and 101 deletions
--- a/pospell.py
+++ b/pospell.py
@@ -1,13 +1,11 @@
 """pospell is a spellcheckers for po files containing reStructuedText."""
 import collections
 import functools
-import io
 import logging
 import multiprocessing
 import os
 import subprocess
 import sys
-from contextlib import redirect_stderr
 from itertools import chain
 from pathlib import Path
 from shutil import which
@@ -15,14 +13,10 @@ from string import digits
 from typing import List, Tuple
 from unicodedata import category

-import docutils.frontend
-import docutils.nodes
-import docutils.parsers.rst
 import polib
 import regex
-from docutils.parsers.rst import roles
-from docutils.utils import new_document
 from sphinxlint import rst
+from sphinxlint.utils import escape2null

 __version__ = "1.3"

@@ -49,112 +43,54 @@ except FileNotFoundError:
    sys.exit(1)


-class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
-    """Used to represent any unknown roles, so we can parse any rst blindly."""
+def match_size(re_match):
+    """Return the length of a re match."""
+    return re_match.end() - re_match.start()


-def monkey_patch_role(role):
-    """Patch docutils.parsers.rst.roles.role so it always match.
+def _clean_heuristic(paragraph, pattern):
+    """Remove the regex from the paragraph.

-    Giving a DummyNodeClass for unknown roles.
+    Removal starts with the most "credible" matches (here be dragons).
+
+    To remove `(.*)` from `(abc def ghi (jkl)`, a bad move consists of
+    removing everything (eating a lone `(`), while the most credible
+    action to take is to remove `(jkl)`, leaving a lone `(`.
    """
-
-    def role_or_generic(role_name, language_module, lineno, reporter):
-        base_role, message = role(role_name, language_module, lineno, reporter)
-        if base_role is None:
-            roles.register_generic_role(role_name, DummyNodeClass)
-            base_role, message = role(role_name, language_module, lineno, reporter)
-        return base_role, message
-
-    return role_or_generic
+    while True:
+        candidate = min(
+            pattern.finditer(paragraph, overlapped=True), key=match_size, default=None
+        )
+        if candidate is None:
+            return paragraph
+        paragraph = paragraph[: candidate.start()] + paragraph[candidate.end() :]


-roles.role = monkey_patch_role(roles.role)
+def clean_paragraph(paragraph):
+    """Remove all well-formed constructs, so detectors can focus on bad ones.

-
-class NodeToTextVisitor(docutils.nodes.NodeVisitor):
-    """Recursively convert a docutils node to a Python string.
-
-    Usage:
-
-    >>> visitor = NodeToTextVisitor(document)
-    >>> document.walk(visitor)
-    >>> print(str(visitor))
-
-    It ignores (see IGNORE_LIST) some nodes, which we don't want in
-    hunspell (enphasis typically contain proper names that are unknown
-    to dictionaires).
+    It removes all well-formed inline literals, inline internal
+    targets, and roles.
    """
-
-    IGNORE_LIST = (
-        "emphasis",
-        "superscript",
-        "title_reference",
-        "substitution_reference",
-        "citation_reference",
-        "strong",
-        "DummyNodeClass",
-        "reference",
-        "literal",
-        "Text",
-        "system_message",
-    )
-
-    def __init__(self, document):
-        """Initialize visitor for the given node/document."""
-        self.output = []
-        super().__init__(document)
-
-    def unknown_visit(self, node):
-        """Mandatory implementation to visit unknwon nodes."""
-
-    @staticmethod
-    def ignore(node):
-        """Just raise SkipChildren.
-
-        Used for all visit_* in the IGNORE_LIST.
-
-        See __getattr__.
-        """
-        raise docutils.nodes.SkipChildren
-
-    def __getattr__(self, name):
-        """Skip childrens from the IGNORE_LIST."""
-        if name.startswith("visit_") and name[6:] in self.IGNORE_LIST:
-            return self.ignore
-        raise AttributeError(name)
-
-    def visit_Text(self, node):
-        """Keep this node text, this is typically what we want to spell check."""
-        self.output.append(docutils.nodes.unescape(node, restore_backslashes=True))
-
-    def __str__(self):
-        """Give the accumulated strings."""
-        return " ".join(self.output)
+    paragraph = escape2null(paragraph)
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\*\*", r"\*\*"))
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\*", r"\*"))
+    paragraph = _clean_heuristic(paragraph, rst.INLINE_LITERAL_RE)
+    paragraph = _clean_heuristic(paragraph, rst.INLINE_INTERNAL_TARGET_RE)
+    paragraph = rst.NORMAL_ROLE_RE.sub("", paragraph)
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\[", r"\]_"))
+    paragraph = _clean_heuristic(paragraph, rst.HYPERLINK_REFERENCES_RE)
+    paragraph = _clean_heuristic(paragraph, rst.ANONYMOUS_HYPERLINK_REFERENCES_RE)
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"", r"_"))
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\|", r"\|"))
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\|", r"\|_"))
+    paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\|", r"\|__"))
+    return paragraph.replace("\x00", "\\")


 def strip_rst(line):
    """Transform reStructuredText to plain text."""
-    if line.endswith("::"):
-        # Drop :: at the end, it would cause Literal block expected
-        line = line[:-2]
-    line = rst.NORMAL_ROLE_RE.sub("", line)
-    settings = docutils.frontend.get_default_settings()
-    settings.pep_references = None
-    settings.rfc_references = None
-    settings.pep_base_url = "http://www.python.org/dev/peps/"
-    settings.pep_file_url_template = "pep-%04d"
-    parser = docutils.parsers.rst.Parser()
-    stderr_stringio = io.StringIO()
-    with redirect_stderr(stderr_stringio):
-        document = new_document("<rst-doc>", settings=settings)
-        parser.parse(line, document)
-    stderr = stderr_stringio.getvalue()
-    if stderr:
-        print(stderr.strip(), "while parsing:", line)
-    visitor = NodeToTextVisitor(document)
-    document.walk(visitor)
-    return str(visitor)
+    return clean_paragraph(line)


 def clear(line, drop_capitalized=False, po_path=""):