Add vocabulary correction pipeline and example config

2026-02-25 10:03:32 -03:00 · 2026-02-25 10:03:32 -03:00 · c3503fbbde
commit c3503fbbde
parent f9224621fa
9 changed files with 865 additions and 23 deletions
--- a/src/aiprocess.py
+++ b/src/aiprocess.py
@ -24,6 +24,9 @@ SYSTEM_PROMPT = (
    "- Remove filler words (um/uh/like)\n"
    "- Remove false starts\n"
    "- Remove self-corrections.\n"
+    "- If a <dictionary> section exists, apply only the listed corrections.\n"
+    "- Keep dictionary spellings exactly as provided.\n"
+    "- Treat domain hints as advisory only; never invent context-specific jargon.\n"
    "- Output ONLY the cleaned text, no commentary.\n\n"
    "Examples:\n"
    "   - \"Hey, schedule that for 5 PM, I mean 4 PM\" -> \"Hey, schedule that for 4 PM\"\n"
@ -49,9 +52,23 @@ class LlamaProcessor:
            verbose=verbose,
        )

-    def process(self, text: str, lang: str = "en") -> str:
-        user_content = f"<transcript>{text}</transcript>"
-        user_content = f"<language>{lang}</language>\n{user_content}"
+    def process(
+        self,
+        text: str,
+        lang: str = "en",
+        *,
+        dictionary_context: str = "",
+        domain_name: str = "general",
+        domain_confidence: float = 0.0,
+    ) -> str:
+        blocks = [
+            f"<language>{lang}</language>",
+            f'<domain name="{domain_name}" confidence="{domain_confidence:.2f}"/>',
+        ]
+        if dictionary_context.strip():
+            blocks.append(f"<dictionary>\n{dictionary_context.strip()}\n</dictionary>")
+        blocks.append(f"<transcript>{text}</transcript>")
+        user_content = "\n".join(blocks)
        response = self.client.create_chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},