Add vocabulary correction pipeline and example config

This commit is contained in:
Thales Maciel 2026-02-25 10:03:32 -03:00
parent f9224621fa
commit c3503fbbde
9 changed files with 865 additions and 23 deletions

View file

@@ -24,6 +24,9 @@ SYSTEM_PROMPT = (
"- Remove filler words (um/uh/like)\n"
"- Remove false starts\n"
"- Remove self-corrections.\n"
"- If a <dictionary> section exists, apply only the listed corrections.\n"
"- Keep dictionary spellings exactly as provided.\n"
"- Treat domain hints as advisory only; never invent context-specific jargon.\n"
"- Output ONLY the cleaned text, no commentary.\n\n"
"Examples:\n"
" - \"Hey, schedule that for 5 PM, I mean 4 PM\" -> \"Hey, schedule that for 4 PM\"\n"
@@ -49,9 +52,23 @@ class LlamaProcessor:
verbose=verbose,
)
def process(self, text: str, lang: str = "en") -> str:
user_content = f"<transcript>{text}</transcript>"
user_content = f"<language>{lang}</language>\n{user_content}"
def process(
self,
text: str,
lang: str = "en",
*,
dictionary_context: str = "",
domain_name: str = "general",
domain_confidence: float = 0.0,
) -> str:
blocks = [
f"<language>{lang}</language>",
f'<domain name="{domain_name}" confidence="{domain_confidence:.2f}"/>',
]
if dictionary_context.strip():
blocks.append(f"<dictionary>\n{dictionary_context.strip()}\n</dictionary>")
blocks.append(f"<transcript>{text}</transcript>")
user_content = "\n".join(blocks)
response = self.client.create_chat_completion(
messages=[
{"role": "system", "content": SYSTEM_PROMPT},