Add a string tokenizer helper function.
In order to efficiently look up stations from a user's input, this function might be used to implement some sort of full-text search.
This commit is contained in:
parent
0d61228d8d
commit
2c520910a2
|
@ -1,3 +1,5 @@
|
|||
import re
|
||||
|
||||
from typing import Any
|
||||
from collections.abc import Sequence, Iterable
|
||||
|
||||
|
@ -30,3 +32,30 @@ class Database:
|
|||
|
||||
|
||||
# Module-level singleton holding the application's database handle,
# configured from settings.SQLITE_URI (presumably a SQLite connection
# URI — `Database` and `settings` are defined outside this chunk; verify).
database = Database(settings.SQLITE_URI)
|
||||
|
||||
|
||||
# Translation table indexed by Latin-1 code point (one 64-char row per
# line, 256 entries total).  Letters are lower-cased and stripped of
# diacritics, digits pass through unchanged, letters with no single
# ASCII equivalent (ae, eth, thorn, sharp-s) become "_" (still a word
# character for the \w regex below), and every other character becomes
# the separator "-".
_TOKENIZE_TRANSLATE_TABLE = (
    "------------------------------------------------0123456789------"
    "-abcdefghijklmnopqrstuvwxyz------abcdefghijklmnopqrstuvwxyz-----"
    "----------------------------------------------------------------"
    "aaaaaa_ceeeeiiii_nooooo-ouuuuy__aaaaaa_ceeeeiiii_nooooo-ouuuuy_y"
)


def tokenize(text: str):
    """Split *text* into lowercase ASCII word tokens (diacritics removed).

    Only Latin-1 input is accepted, since the translation table above
    covers exactly the first 256 code points.

    Returns:
        A list of ``(token, start_offset)`` pairs, one per word found.

    Raises:
        ValueError: if *text* contains a character outside Latin-1.
    """
    # Reject anything the 256-entry table cannot translate.
    for character in text:
        if ord(character) > 255:
            raise ValueError(text)

    # str.translate looks each character up by ord() in the table string,
    # producing a lowercase ASCII string of word chars and "-" separators.
    normalized = text.translate(_TOKENIZE_TRANSLATE_TABLE)

    tokens = []
    for match in re.finditer(r"\w+", normalized):
        tokens.append((match.group(0), match.start(0)))
    return tokens
|
||||
|
|
Loading…
Reference in New Issue