Add a string tokenizer helper function.

To look up stations efficiently from a user's input, this function
might be used to implement some sort of full-text search.
This commit is contained in:
Barbagus42 2023-11-11 08:37:40 +01:00
parent 0d61228d8d
commit 2c520910a2
1 changed files with 29 additions and 0 deletions

View File

@ -1,3 +1,5 @@
import re
from typing import Any
from collections.abc import Sequence, Iterable
@ -30,3 +32,30 @@ class Database:
# Module-level ``Database`` instance, constructed with ``settings.SQLITE_URI``.
database = Database(settings.SQLITE_URI)
# Lookup table for ``str.translate``: the character at index ``i`` is the
# replacement for the input character whose code point is ``i``. It has
# exactly 256 entries, one per code point in the range 0x00-0xFF.
#   * ASCII digits are kept; ASCII letters are folded to lower case.
#   * Accented letters in 0xC0-0xFF become their unaccented lower-case letter.
#   * Æ, Ð, Þ, ß, æ, ð and þ become "_" (which the regex ``\w`` still matches).
#   * Every other code point (including "_" itself, × and ÷) becomes "-".
_TOKENIZE_TRANSLATE_TABLE = "".join(
    (
        "-" * 48,  # 0x00-0x2F: control characters, space, punctuation
        "0123456789",  # 0x30-0x39: digits
        "-" * 7,  # 0x3A-0x40: ":" through "@"
        "abcdefghijklmnopqrstuvwxyz",  # 0x41-0x5A: "A"-"Z", lower-cased
        "-" * 6,  # 0x5B-0x60: "[" through "`"
        "abcdefghijklmnopqrstuvwxyz",  # 0x61-0x7A: "a"-"z"
        "-" * 5,  # 0x7B-0x7F: "{" through DEL
        "-" * 64,  # 0x80-0xBF: no letters or digits kept
        "aaaaaa_ceeeeiiii_nooooo-ouuuuy__",  # 0xC0-0xDF: "À" through "ß"
        "aaaaaa_ceeeeiiii_nooooo-ouuuuy_y",  # 0xE0-0xFF: "à" through "ÿ"
    )
)
def tokenize(text: str):
    """Split *text* into tokens, each paired with its start offset.

    Every character is first replaced through ``_TOKENIZE_TRANSLATE_TABLE``
    (ASCII letters lower-cased, Latin-1 diacritics removed, separators
    turned into "-"). The table maps one character to one character, so
    the offsets reported for the translated text are also valid offsets
    into the original *text*. Each maximal run of regex word characters
    in the translated text then becomes one token.

    Returns:
        A list of ``(token, start_index)`` tuples, in order of appearance.

    Raises:
        ValueError: if *text* contains a character whose code point is
            above 255; the exception carries *text* as its argument.
    """
    # The translate table only covers code points 0-255; reject anything else.
    for char in text:
        if ord(char) > 255:
            raise ValueError(text)

    normalized = text.translate(_TOKENIZE_TRANSLATE_TABLE)

    tokens = []
    for match in re.finditer(r"\w+", normalized):
        tokens.append((match.group(0), match.start(0)))
    return tokens