WIP: Try without docutils.

This commit is contained in:
Julien Palard 2023-07-21 12:32:43 +02:00
parent 8b753bde26
commit 8a5004128a
Signed by: mdk
GPG Key ID: 0EFC1AC1006886F8

View File

@ -1,13 +1,11 @
"""pospell is a spellchecker for po files containing reStructuredText."""
import collections
import functools
import io
import logging
import multiprocessing
import os
import subprocess
import sys
from contextlib import redirect_stderr
from itertools import chain
from pathlib import Path
from shutil import which
@ -15,14 +13,10 @@ from string import digits
from typing import List, Tuple
from unicodedata import category
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import polib
import regex
from docutils.parsers.rst import roles
from docutils.utils import new_document
from sphinxlint import rst
from sphinxlint.utils import escape2null
__version__ = "1.3"
@ -49,112 +43,54 @@ except FileNotFoundError:
sys.exit(1)
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
    """Stand-in inline node for roles that docutils does not know.

    Having a node class for any unknown role lets arbitrary rst be parsed
    blindly.
    """
def match_size(re_match):
    """Return how many characters the given regex match spans."""
    first, last = re_match.span()
    return last - first
def monkey_patch_role(role):
    """Wrap docutils.parsers.rst.roles.role so that it always matches.

    When the wrapped lookup does not find a role, the role name is
    registered as a generic role backed by DummyNodeClass and the lookup
    is retried.

    Returns the wrapping function, which takes the same arguments as
    `role` and returns the same `(base_role, message)` pair.
    """

    def role_or_generic(role_name, language_module, lineno, reporter):
        base_role, message = role(role_name, language_module, lineno, reporter)
        if base_role is None:
            # Unknown role: register a generic one on the fly, then retry.
            roles.register_generic_role(role_name, DummyNodeClass)
            base_role, message = role(role_name, language_module, lineno, reporter)
        return base_role, message

    return role_or_generic


# Install the permissive role lookup when the module is imported.
roles.role = monkey_patch_role(roles.role)


def _clean_heuristic(paragraph, pattern):
    """Remove every match of pattern from paragraph, shortest match first.

    Removal starts with the most "credible" matches (here be dragons).

    To remove `(.*)` from `(abc def ghi (jkl)`, a bad move consists of
    removing everything (eating a lone `(`), while the most credible
    action to take is to remove `(jkl)`, leaving a lone `(`.

    `pattern` must be a compiled pattern whose `finditer` accepts
    `overlapped=True`. Returns the paragraph once no non-empty match is left.
    """
    while True:
        candidate = min(
            (
                match
                for match in pattern.finditer(paragraph, overlapped=True)
                # A zero-width match removes nothing; picking it would leave
                # the paragraph unchanged and this loop would never end.
                if match_size(match) > 0
            ),
            key=match_size,
            default=None,
        )
        if candidate is None:
            return paragraph
        paragraph = paragraph[: candidate.start()] + paragraph[candidate.end() :]
def clean_paragraph(paragraph):
    """Remove all good constructs, so detectors can focus on bad ones.

    It removes all well formed inline literals, inline internal
    targets, and roles, along with the other inline markup patterns
    listed below.

    escape2null is applied first; NUL characters are turned back into
    backslashes in the returned string.
    """
    text = escape2null(paragraph)
    # The order matters: these patterns are removed before the roles...
    before_roles = (
        rst.inline_markup_gen(r"\*\*", r"\*\*"),
        rst.inline_markup_gen(r"\*", r"\*"),
        rst.INLINE_LITERAL_RE,
        rst.INLINE_INTERNAL_TARGET_RE,
    )
    # ...and these ones after.
    after_roles = (
        rst.inline_markup_gen(r"\[", r"\]_"),
        rst.HYPERLINK_REFERENCES_RE,
        rst.ANONYMOUS_HYPERLINK_REFERENCES_RE,
        rst.inline_markup_gen(r"", r"_"),
        rst.inline_markup_gen(r"\|", r"\|"),
        rst.inline_markup_gen(r"\|", r"\|_"),
        rst.inline_markup_gen(r"\|", r"\|__"),
    )
    for pattern in before_roles:
        text = _clean_heuristic(text, pattern)
    text = rst.NORMAL_ROLE_RE.sub("", text)
    for pattern in after_roles:
        text = _clean_heuristic(text, pattern)
    return text.replace("\x00", "\\")


class NodeToTextVisitor(docutils.nodes.NodeVisitor):
    """Recursively convert a docutils node to a Python string.

    Usage:

    >>> visitor = NodeToTextVisitor(document)
    >>> document.walk(visitor)
    >>> print(str(visitor))

    It ignores (see IGNORE_LIST) some nodes, which we don't want in
    hunspell (emphasis typically contains proper names that are unknown
    to dictionaries).
    """

    IGNORE_LIST = (
        "emphasis",
        "superscript",
        "title_reference",
        "substitution_reference",
        "citation_reference",
        "strong",
        "DummyNodeClass",
        "reference",
        "literal",
        "Text",
        "system_message",
    )

    def __init__(self, document):
        """Initialize visitor for the given node/document."""
        self.output = []
        super().__init__(document)

    def unknown_visit(self, node):
        """Mandatory implementation to visit unknown nodes; does nothing."""

    @staticmethod
    def ignore(node):
        """Just raise SkipChildren.

        Used for all visit_* in the IGNORE_LIST.

        See __getattr__.
        """
        raise docutils.nodes.SkipChildren

    def __getattr__(self, name):
        """Skip children of the nodes named in IGNORE_LIST."""
        prefix = "visit_"
        if not name.startswith(prefix) or name[len(prefix) :] not in self.IGNORE_LIST:
            raise AttributeError(name)
        return self.ignore

    def visit_Text(self, node):
        """Keep this node text, this is typically what we want to spell check."""
        text = docutils.nodes.unescape(node, restore_backslashes=True)
        self.output.append(text)

    def __str__(self):
        """Give the accumulated strings, joined by single spaces."""
        return " ".join(self.output)
def strip_rst(line):
    """Transform reStructuredText to plain text.

    A trailing "::" is dropped and every match of rst.NORMAL_ROLE_RE is
    removed before the line is parsed with docutils; the text gathered by
    NodeToTextVisitor is returned.
    """
    if line.endswith("::"):
        # Drop :: at the end, it would cause Literal block expected
        line = line[:-2]
    line = rst.NORMAL_ROLE_RE.sub("", line)
    settings = docutils.frontend.get_default_settings()
    settings.pep_references = None
    settings.rfc_references = None
    settings.pep_base_url = "http://www.python.org/dev/peps/"
    settings.pep_file_url_template = "pep-%04d"
    parser = docutils.parsers.rst.Parser()
    stderr_stringio = io.StringIO()
    # Capture anything written to stderr while parsing, so it can be
    # reported below together with the offending line.
    with redirect_stderr(stderr_stringio):
        document = new_document("<rst-doc>", settings=settings)
        parser.parse(line, document)
    stderr = stderr_stringio.getvalue()
    if stderr:
        print(stderr.strip(), "while parsing:", line)
    visitor = NodeToTextVisitor(document)
    document.walk(visitor)
    return str(visitor)
    # NOTE(review): unreachable as written. This looks like the replacement
    # return from the docutils-free rewrite, merged in from a rendered diff;
    # confirm which of the two bodies is meant to stay.
    return clean_paragraph(line)
def clear(line, drop_capitalized=False, po_path=""):