Add vocabulary correction pipeline and example config
This commit is contained in:
parent
f9224621fa
commit
c3503fbbde
9 changed files with 865 additions and 23 deletions
|
|
@@ -24,6 +24,9 @@ SYSTEM_PROMPT = (
|
|||
"- Remove filler words (um/uh/like)\n"
|
||||
"- Remove false starts\n"
|
||||
"- Remove self-corrections.\n"
|
||||
"- If a <dictionary> section exists, apply only the listed corrections.\n"
|
||||
"- Keep dictionary spellings exactly as provided.\n"
|
||||
"- Treat domain hints as advisory only; never invent context-specific jargon.\n"
|
||||
"- Output ONLY the cleaned text, no commentary.\n\n"
|
||||
"Examples:\n"
|
||||
" - \"Hey, schedule that for 5 PM, I mean 4 PM\" -> \"Hey, schedule that for 4 PM\"\n"
|
||||
|
|
@@ -49,9 +52,23 @@ class LlamaProcessor:
|
|||
verbose=verbose,
|
||||
)
|
||||
|
||||
def process(self, text: str, lang: str = "en") -> str:
|
||||
user_content = f"<transcript>{text}</transcript>"
|
||||
user_content = f"<language>{lang}</language>\n{user_content}"
|
||||
def process(
|
||||
self,
|
||||
text: str,
|
||||
lang: str = "en",
|
||||
*,
|
||||
dictionary_context: str = "",
|
||||
domain_name: str = "general",
|
||||
domain_confidence: float = 0.0,
|
||||
) -> str:
|
||||
blocks = [
|
||||
f"<language>{lang}</language>",
|
||||
f'<domain name="{domain_name}" confidence="{domain_confidence:.2f}"/>',
|
||||
]
|
||||
if dictionary_context.strip():
|
||||
blocks.append(f"<dictionary>\n{dictionary_context.strip()}\n</dictionary>")
|
||||
blocks.append(f"<transcript>{text}</transcript>")
|
||||
user_content = "\n".join(blocks)
|
||||
response = self.client.create_chat_completion(
|
||||
messages=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue