WIP: Try without docutils.
This commit is contained in:
parent
8b753bde26
commit
8a5004128a
138
pospell.py
138
pospell.py
|
@ -1,13 +1,11 @@
|
|||
"""pospell is a spellcheckers for po files containing reStructuedText."""
|
||||
import collections
|
||||
import functools
|
||||
import io
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from contextlib import redirect_stderr
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from shutil import which
|
||||
|
@ -15,14 +13,10 @@ from string import digits
|
|||
from typing import List, Tuple
|
||||
from unicodedata import category
|
||||
|
||||
import docutils.frontend
|
||||
import docutils.nodes
|
||||
import docutils.parsers.rst
|
||||
import polib
|
||||
import regex
|
||||
from docutils.parsers.rst import roles
|
||||
from docutils.utils import new_document
|
||||
from sphinxlint import rst
|
||||
from sphinxlint.utils import escape2null
|
||||
|
||||
__version__ = "1.3"
|
||||
|
||||
|
@ -49,112 +43,54 @@ except FileNotFoundError:
|
|||
sys.exit(1)
|
||||
|
||||
|
||||
class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
|
||||
"""Used to represent any unknown roles, so we can parse any rst blindly."""
|
||||
def match_size(re_match):
    """Return the length, in characters, of a regex match.

    ``re_match`` is a match object exposing ``start()`` and ``end()``;
    the result is the distance between those two offsets, so an empty
    match has size 0.

    Used as the ``key`` for ``min()`` in ``_clean_heuristic`` to pick the
    shortest candidate match first.
    """
    return re_match.end() - re_match.start()
|
||||
|
||||
|
||||
def monkey_patch_role(role):
|
||||
"""Patch docutils.parsers.rst.roles.role so it always match.
|
||||
def _clean_heuristic(paragraph, pattern):
|
||||
"""Remove the regex from the paragraph.
|
||||
|
||||
Giving a DummyNodeClass for unknown roles.
|
||||
Removal starts with the most "credible" matches (here be dragons).
|
||||
|
||||
To remove `(.*)` from `(abc def ghi (jkl)`, a bad move consists of
|
||||
removing everything (eating a lone `(`), while the most credible
|
||||
action to take is to remove `(jkl)`, leaving a lone `(`.
|
||||
"""
|
||||
|
||||
def role_or_generic(role_name, language_module, lineno, reporter):
|
||||
base_role, message = role(role_name, language_module, lineno, reporter)
|
||||
if base_role is None:
|
||||
roles.register_generic_role(role_name, DummyNodeClass)
|
||||
base_role, message = role(role_name, language_module, lineno, reporter)
|
||||
return base_role, message
|
||||
|
||||
return role_or_generic
|
||||
while True:
|
||||
candidate = min(
|
||||
pattern.finditer(paragraph, overlapped=True), key=match_size, default=None
|
||||
)
|
||||
if candidate is None:
|
||||
return paragraph
|
||||
paragraph = paragraph[: candidate.start()] + paragraph[candidate.end() :]
|
||||
|
||||
|
||||
roles.role = monkey_patch_role(roles.role)
|
||||
def clean_paragraph(paragraph):
|
||||
"""Removes all good constructs, so detectors can focus on bad ones.
|
||||
|
||||
|
||||
class NodeToTextVisitor(docutils.nodes.NodeVisitor):
|
||||
"""Recursively convert a docutils node to a Python string.
|
||||
|
||||
Usage:
|
||||
|
||||
>>> visitor = NodeToTextVisitor(document)
|
||||
>>> document.walk(visitor)
|
||||
>>> print(str(visitor))
|
||||
|
||||
It ignores (see IGNORE_LIST) some nodes, which we don't want in
|
||||
hunspell (emphasis typically contains proper names that are unknown
|
||||
to dictionaries).
|
||||
It removes all well formed inline literals, inline internal
|
||||
targets, and roles.
|
||||
"""
|
||||
|
||||
IGNORE_LIST = (
|
||||
"emphasis",
|
||||
"superscript",
|
||||
"title_reference",
|
||||
"substitution_reference",
|
||||
"citation_reference",
|
||||
"strong",
|
||||
"DummyNodeClass",
|
||||
"reference",
|
||||
"literal",
|
||||
"Text",
|
||||
"system_message",
|
||||
)
|
||||
|
||||
def __init__(self, document):
|
||||
"""Initialize visitor for the given node/document."""
|
||||
self.output = []
|
||||
super().__init__(document)
|
||||
|
||||
def unknown_visit(self, node):
|
||||
"""Mandatory implementation to visit unknwon nodes."""
|
||||
|
||||
@staticmethod
|
||||
def ignore(node):
|
||||
"""Just raise SkipChildren.
|
||||
|
||||
Used for all visit_* in the IGNORE_LIST.
|
||||
|
||||
See __getattr__.
|
||||
"""
|
||||
raise docutils.nodes.SkipChildren
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Skip childrens from the IGNORE_LIST."""
|
||||
if name.startswith("visit_") and name[6:] in self.IGNORE_LIST:
|
||||
return self.ignore
|
||||
raise AttributeError(name)
|
||||
|
||||
def visit_Text(self, node):
|
||||
"""Keep this node text, this is typically what we want to spell check."""
|
||||
self.output.append(docutils.nodes.unescape(node, restore_backslashes=True))
|
||||
|
||||
def __str__(self):
|
||||
"""Give the accumulated strings."""
|
||||
return " ".join(self.output)
|
||||
paragraph = escape2null(paragraph)
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\*\*", r"\*\*"))
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\*", r"\*"))
|
||||
paragraph = _clean_heuristic(paragraph, rst.INLINE_LITERAL_RE)
|
||||
paragraph = _clean_heuristic(paragraph, rst.INLINE_INTERNAL_TARGET_RE)
|
||||
paragraph = rst.NORMAL_ROLE_RE.sub("", paragraph)
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\[", r"\]_"))
|
||||
paragraph = _clean_heuristic(paragraph, rst.HYPERLINK_REFERENCES_RE)
|
||||
paragraph = _clean_heuristic(paragraph, rst.ANONYMOUS_HYPERLINK_REFERENCES_RE)
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"", r"_"))
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\|", r"\|"))
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\|", r"\|_"))
|
||||
paragraph = _clean_heuristic(paragraph, rst.inline_markup_gen(r"\|", r"\|__"))
|
||||
return paragraph.replace("\x00", "\\")
|
||||
|
||||
|
||||
def strip_rst(line):
|
||||
"""Transform reStructuredText to plain text."""
|
||||
if line.endswith("::"):
|
||||
# Drop the trailing "::"; it would cause a "Literal block expected" error
|
||||
line = line[:-2]
|
||||
line = rst.NORMAL_ROLE_RE.sub("", line)
|
||||
settings = docutils.frontend.get_default_settings()
|
||||
settings.pep_references = None
|
||||
settings.rfc_references = None
|
||||
settings.pep_base_url = "http://www.python.org/dev/peps/"
|
||||
settings.pep_file_url_template = "pep-%04d"
|
||||
parser = docutils.parsers.rst.Parser()
|
||||
stderr_stringio = io.StringIO()
|
||||
with redirect_stderr(stderr_stringio):
|
||||
document = new_document("<rst-doc>", settings=settings)
|
||||
parser.parse(line, document)
|
||||
stderr = stderr_stringio.getvalue()
|
||||
if stderr:
|
||||
print(stderr.strip(), "while parsing:", line)
|
||||
visitor = NodeToTextVisitor(document)
|
||||
document.walk(visitor)
|
||||
return str(visitor)
|
||||
return clean_paragraph(line)
|
||||
|
||||
|
||||
def clear(line, drop_capitalized=False, po_path=""):
|
||||
|
|
Loading…
Reference in New Issue
Block a user