TER/ter/helpers.py

62 lines
1.5 KiB
Python

import re
from typing import Any
from collections.abc import Sequence, Iterable
import aiosqlite
from ter.config import Settings
# Settings object shared by this module; it is instantiated once, at import time.
settings = Settings()
class Database:
Params = Sequence[Any] | dict[str, Any]
def __init__(self, uri: str) -> None:
self._uri: str = uri
self._connection: aiosqlite.Connection | None = None
async def connect(self):
self._connection = await aiosqlite.connect(self._uri, uri=True)
async def disconnect(self):
await self._connection.close()
self._connection = None
async def execute(self, sql: str, params: Params = ()):
return await self._connection.execute(sql, params)
async def executemany(self, sql: str, params: Iterable[Params]):
return await self._connection.executemany(sql, params)
# Module-level Database built from settings.SQLITE_URI. Constructing it only
# stores the URI; connect() must be awaited before any statement is executed.
database = Database(settings.SQLITE_URI)
# Folding table used by tokenize(): str.translate() indexes it by code point,
# so entry N is the replacement for chr(N). Each row is meant to cover 64
# code points, giving one entry for every character up to U+00FF.
_TOKENIZE_TRANSLATE_TABLE = (
    # U+0000-U+003F: digits are kept, everything else becomes "-".
    "------------------------------------------------0123456789------"
    # U+0040-U+007F: ASCII letters are lower-cased, the rest becomes "-".
    "-abcdefghijklmnopqrstuvwxyz------abcdefghijklmnopqrstuvwxyz-----"
    # U+0080-U+00BF: no letters in this range, all "-".
    "----------------------------------------------------------------"
    # U+00C0-U+00FF: accented letters fold to their base letter; letters with
    # no single-letter ASCII form become "_", which still counts as a word
    # character for the token regex.
    "aaaaaa_ceeeeiiii_nooooo-ouuuuy__aaaaaa_ceeeeiiii_nooooo-ouuuuy_y"
)


def tokenize(text: str):
    """Split Latin-1 text into lower-case ASCII tokens with their offsets.

    Each character is folded through ``_TOKENIZE_TRANSLATE_TABLE`` (one
    output character per input character, so offsets are preserved), then
    every maximal run of word characters becomes a token.

    Returns a list of ``(token, start)`` tuples, where ``start`` is the index
    of the token's first character in *text*.

    Raises ``ValueError`` (carrying *text*) if any character is above U+00FF,
    because the folding table has no entry for it.
    """
    for char in text:
        if char > "\xff":
            raise ValueError(text)

    folded = text.translate(_TOKENIZE_TRANSLATE_TABLE)
    return [(hit[0], hit.start()) for hit in re.finditer(r"\w+", folded)]