62 lines
1.5 KiB
Python
62 lines
1.5 KiB
Python
import re
|
|
|
|
from typing import Any
|
|
from collections.abc import Sequence, Iterable
|
|
|
|
import aiosqlite
|
|
|
|
from ter.config import Settings
|
|
|
|
# Module-level Settings instance, built once when this module is imported.
settings = Settings()
|
|
|
|
|
|
class Database:
|
|
Params = Sequence[Any] | dict[str, Any]
|
|
|
|
def __init__(self, uri: str) -> None:
|
|
self._uri: str = uri
|
|
self._connection: aiosqlite.Connection | None = None
|
|
|
|
async def connect(self):
|
|
self._connection = await aiosqlite.connect(self._uri, uri=True)
|
|
|
|
async def disconnect(self):
|
|
await self._connection.close()
|
|
self._connection = None
|
|
|
|
async def execute(self, sql: str, params: Params = ()):
|
|
return await self._connection.execute(sql, params)
|
|
|
|
async def executemany(self, sql: str, params: Iterable[Params]):
|
|
return await self._connection.executemany(sql, params)
|
|
|
|
|
|
# Shared module-level Database built from settings.SQLITE_URI.
# Constructing it only stores the URI; no connection is opened until
# connect() is awaited.
database = Database(settings.SQLITE_URI)
|
|
|
|
|
|
_TOKENIZE_TRANSLATE_TABLE = (
|
|
"------------------------------------------------0123456789------"
|
|
"-abcdefghijklmnopqrstuvwxyz------abcdefghijklmnopqrstuvwxyz-----"
|
|
"----------------------------------------------------------------"
|
|
"aaaaaa_ceeeeiiii_nooooo-ouuuuy__aaaaaa_ceeeeiiii_nooooo-ouuuuy_y"
|
|
)
|
|
|
|
|
|
def tokenize(text: str):
|
|
"""Break a string into ASCII tokens (removing diacritic).
|
|
|
|
Return a list of token, position pairs.
|
|
"""
|
|
|
|
if any(ord(c) > 255 for c in text):
|
|
raise ValueError(text)
|
|
|
|
text = text.translate(_TOKENIZE_TRANSLATE_TABLE)
|
|
return [
|
|
(
|
|
m.group(0),
|
|
m.start(0),
|
|
)
|
|
for m in re.finditer(r"\w+", text)
|
|
]
|