Add a string tokenizer helper function.

To efficiently look up stations from the user's input, this function might be used to implement some sort of full-text search.
2023-11-11 08:37:40 +01:00 · 2023-11-11 08:37:40 +01:00 · 2c520910a2
parent 0d61228d8d
commit 2c520910a2
1 changed files with 29 additions and 0 deletions
--- a/ter/helpers.py
+++ b/ter/helpers.py
@ -1,3 +1,5 @@
+import re
+
 from typing import Any
 from collections.abc import Sequence, Iterable

@ -30,3 +32,30 @@ class Database:


 database = Database(settings.SQLITE_URI)
+
+
+_TOKENIZE_TRANSLATE_TABLE = (  # one output char per code point 0-255
+    "------------------------------------------------0123456789------"  # 0-63: digits kept, rest "-"
+    "-abcdefghijklmnopqrstuvwxyz------abcdefghijklmnopqrstuvwxyz-----"  # 64-127: A-Z and a-z -> a-z
+    "----------------------------------------------------------------"  # 128-191: all "-"
+    "aaaaaa_ceeeeiiii_nooooo-ouuuuy__aaaaaa_ceeeeiiii_nooooo-ouuuuy_y"  # 192-255: accents -> base letter
+)
+
+
+def tokenize(text: str):
+    """Split *text* into ASCII word tokens, folding case and diacritics.
+
+    Every character is replaced one-to-one via _TOKENIZE_TRANSLATE_TABLE,
+    so the offsets returned are valid indexes into the original string.
+
+    Return a list of ``(token, start)`` tuples, one per word-character run.
+    Raise ``ValueError(text)`` if *text* has a code point above 255.
+    """
+
+    # The table has one entry per code point 0-255; reject anything beyond.
+    if max(map(ord, text), default=0) > 255:
+        raise ValueError(text)
+
+    folded = text.translate(_TOKENIZE_TRANSLATE_TABLE)
+    words = re.finditer(r"\w+", folded)
+    return [(word[0], word.start()) for word in words]