Add benchmark-driven model promotion workflow and pipeline stages
Some checks failed
ci / test-and-build (push) Has been cancelled

This commit is contained in:
Thales Maciel 2026-02-28 15:12:33 -03:00
parent 98b13d1069
commit 8c1f7c1e13
38 changed files with 5300 additions and 503 deletions

View file

@ -23,9 +23,8 @@ class VocabularyEngine:
}
self._replacement_pattern = _build_replacement_pattern(rule.source for rule in self._replacements)
# Keep hint payload bounded so model prompts do not balloon.
self._stt_hotwords = self._build_stt_hotwords(limit=128, char_budget=1024)
self._stt_initial_prompt = self._build_stt_initial_prompt(char_budget=600)
# Keep ASR hint payload tiny so Whisper remains high-recall and minimally biased.
self._stt_hotwords = self._build_stt_hotwords(limit=64, char_budget=480)
# Report whether this engine holds any dictionary data: True when either
# self._replacements or self._terms is truthy (non-empty), False otherwise.
def has_dictionary(self) -> bool:
return bool(self._replacements or self._terms)
@ -42,7 +41,7 @@ class VocabularyEngine:
return self._replacement_pattern.sub(_replace, text)
# Return the speech-to-text hints as a (hotwords, initial_prompt) pair of strings.
# NOTE(review): two return statements appear back to back here; read literally,
# the second one is unreachable. This text looks like a rendered diff whose +/-
# markers were stripped, so presumably the first return is the removed line and
# the second (empty initial prompt) is its replacement — confirm against the
# actual file before relying on either.
def build_stt_hints(self) -> tuple[str, str]:
return self._stt_hotwords, self._stt_initial_prompt
return self._stt_hotwords, ""
def build_ai_dictionary_context(self, max_lines: int = 80, char_budget: int = 1500) -> str:
lines: list[str] = []
@ -82,16 +81,6 @@ class VocabularyEngine:
used += addition
return ", ".join(words)
# Build the speech-to-text initial prompt: the fixed prefix
# "Preferred vocabulary: " followed by as much of self._stt_hotwords as fits.
# char_budget (keyword-only) caps the length of the returned string, prefix
# included. Returns "" when there are no hotwords or when nothing fits in the
# budget after the prefix.
# NOTE(review): this text looks like a rendered diff with +/- markers stripped;
# this method is presumably one of the removed blocks — confirm against the
# actual file before reusing it.
def _build_stt_initial_prompt(self, *, char_budget: int) -> str:
if not self._stt_hotwords:
return ""
prefix = "Preferred vocabulary: "
# Room left for hotwords once the prefix is accounted for; clamped at zero.
available = max(char_budget - len(prefix), 0)
# Plain character slice, so the last hotword may be cut mid-word; rstrip then
# removes any trailing comma and space characters left at the cut.
hotwords = self._stt_hotwords[:available].rstrip(", ")
if not hotwords:
return ""
return prefix + hotwords
def _build_replacement_pattern(sources: Iterable[str]) -> re.Pattern[str] | None:
unique_sources = _dedupe_preserve_order(list(sources))