Add a string tokenizer helper function.
To efficiently look up stations from a user's input, this function may be used to implement a form of full-text search.
This commit is contained in:
parent
0d61228d8d
commit
2c520910a2
|
@ -1,3 +1,5 @@
|
||||||
|
import re
|
||||||
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from collections.abc import Sequence, Iterable
|
from collections.abc import Sequence, Iterable
|
||||||
|
|
||||||
|
@ -30,3 +32,30 @@ class Database:
|
||||||
|
|
||||||
|
|
||||||
database = Database(settings.SQLITE_URI)
|
database = Database(settings.SQLITE_URI)
|
||||||
|
|
||||||
|
|
||||||
|
# Translation table mapping every Latin-1 code point (0-255) to a single
# ASCII replacement character.  str.translate() indexes this string by
# ord(c), so position N holds the replacement for code point N:
#   - digits map to themselves,
#   - ASCII letters and accented Latin-1 letters fold to lowercase ASCII
#     with diacritics stripped (e.g. "É" -> "e"),
#   - a few special letters (æ, ð, þ, ß, ...) map to "_", which is a \w
#     character and therefore stays inside a token,
#   - everything else maps to "-", which \w+ treats as a separator.
_TOKENIZE_TRANSLATE_TABLE = (
    # 0x00-0x3F: controls and punctuation; only the digits survive
    "------------------------------------------------0123456789------"
    # 0x40-0x7F: ASCII letters, lowercased; punctuation becomes "-"
    "-abcdefghijklmnopqrstuvwxyz------abcdefghijklmnopqrstuvwxyz-----"
    # 0x80-0xBF: Latin-1 controls/symbols, all separators
    "----------------------------------------------------------------"
    # 0xC0-0xFF: accented Latin-1 letters folded to plain ASCII
    "aaaaaa_ceeeeiiii_nooooo-ouuuuy__aaaaaa_ceeeeiiii_nooooo-ouuuuy_y"
)


def tokenize(text: str) -> list[tuple[str, int]]:
    """Break *text* into lowercase ASCII tokens, stripping diacritics.

    Args:
        text: a Latin-1 compatible string (every character must have a
            code point <= 255).

    Returns:
        A list of ``(token, position)`` pairs, where *position* is the
        index at which the token starts in the normalized string.

    Raises:
        ValueError: if *text* contains a character outside Latin-1.

    >>> tokenize("Héllo World")
    [('hello', 0), ('world', 6)]
    """
    # The fixed 256-entry table cannot represent code points beyond
    # Latin-1; reject them explicitly rather than mis-translating.
    if any(ord(c) > 255 for c in text):
        raise ValueError(f"tokenize() only supports Latin-1 text: {text!r}")

    # Single C-level pass: fold case, strip accents, turn separators
    # into "-" (a non-\w character).
    text = text.translate(_TOKENIZE_TRANSLATE_TABLE)
    # Every maximal run of \w characters is exactly one token.
    return [(m.group(), m.start()) for m in re.finditer(r"\w+", text)]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user