# aman/tests/test_vocabulary.py
#
# 54 lines
# 1.6 KiB
# Python

import sys
import unittest
from pathlib import Path
# Locate the project's ``src`` directory relative to this file and make it
# importable. ROOT is the parent of the directory containing this file.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    # Only add the entry once, and put it at the front of the search path so
    # it is consulted before the entries that were already present.
    sys.path.insert(0, str(SRC))
from config import VocabularyConfig, VocabularyReplacement
from vocabulary import VocabularyEngine
class VocabularyEngineTests(unittest.TestCase):
    """Tests for ``VocabularyEngine`` replacements and STT hint size limits."""

    def _engine(self, replacements=None, terms=None):
        """Build a ``VocabularyEngine`` from the given vocabulary entries.

        Args:
            replacements: ``VocabularyReplacement`` entries to configure.
                ``None`` (or any other falsy value) becomes an empty list.
            terms: Vocabulary terms to configure. ``None`` (or any other
                falsy value) becomes an empty list.

        Returns:
            A ``VocabularyEngine`` constructed from a ``VocabularyConfig``
            holding those replacements and terms.
        """
        vocab = VocabularyConfig(
            replacements=replacements or [],
            terms=terms or [],
        )
        return VocabularyEngine(vocab)

    def test_boundary_aware_replacement(self):
        """A source term is replaced as a whole word, not inside a longer word.

        Both standalone occurrences of "Martha" (including the one directly
        followed by a period) must become "Marta", while "Marthaville" must
        be left untouched.
        """
        engine = self._engine(
            replacements=[VocabularyReplacement(source="Martha", target="Marta")],
        )
        text = "Martha met Marthaville and Martha."
        out = engine.apply_deterministic_replacements(text)
        self.assertEqual(out, "Marta met Marthaville and Marta.")

    def test_longest_match_replacement_wins(self):
        """When sources overlap, the longer source is applied first.

        "new york" must map to "NYC" as a unit rather than having its "york"
        rewritten; the remaining standalone "york" maps to "Yorkshire".
        """
        engine = self._engine(
            replacements=[
                VocabularyReplacement(source="new york", target="NYC"),
                VocabularyReplacement(source="york", target="Yorkshire"),
            ],
        )
        out = engine.apply_deterministic_replacements("new york york")
        self.assertEqual(out, "NYC Yorkshire")

    def test_stt_hints_are_bounded(self):
        """STT hints stay within fixed length limits for a large term list.

        With 300 configured terms, the first value returned by
        ``build_stt_hints`` must have length at most 1024 and the second
        at most 600.
        """
        terms = [f"term{i}" for i in range(300)]
        engine = self._engine(terms=terms)
        hotwords, prompt = engine.build_stt_hints()
        self.assertLessEqual(len(hotwords), 1024)
        self.assertLessEqual(len(prompt), 600)
# Run the tests only when this file is executed directly as a script;
# importing the module must not start the test runner.
if __name__ == "__main__":
    unittest.main()