Revert "Add pipeline engine and remove legacy compatibility paths"

2026-02-26 12:54:47 -03:00 · 2026-02-26 12:54:47 -03:00 · 5b38cc7dcd
commit 5b38cc7dcd
parent e221d49020
18 changed files with 399 additions and 1523 deletions
--- a/src/vocabulary.py
+++ b/src/vocabulary.py
@ -4,7 +4,101 @@ import re
 from dataclasses import dataclass
 from typing import Iterable

-from config import VocabularyConfig
+from config import DomainInferenceConfig, VocabularyConfig
+
+
+# Domain identifiers reported in DomainResult.name.
+# DOMAIN_GENERAL is the fallback returned when no specific domain wins.
+DOMAIN_GENERAL = "general"
+DOMAIN_PERSONAL_NAMES = "personal_names"
+DOMAIN_SOFTWARE_DEV = "software_dev"
+DOMAIN_OPS_INFRA = "ops_infra"
+DOMAIN_BUSINESS = "business"
+DOMAIN_MEDICAL_LEGAL = "medical_legal"
+
+# Domains that receive a score, in tie-break priority order: when two domains
+# reach the same top score, the one listed first here wins. DOMAIN_GENERAL is
+# deliberately absent because it is only ever used as the fallback.
+DOMAIN_ORDER = (
+    DOMAIN_PERSONAL_NAMES,
+    DOMAIN_SOFTWARE_DEV,
+    DOMAIN_OPS_INFRA,
+    DOMAIN_BUSINESS,
+    DOMAIN_MEDICAL_LEGAL,
+)
+
+# Single-token, lowercase keywords per domain. They are compared for equality
+# with tokens extracted from the casefolded input, so entries must be lowercase
+# and must not contain whitespace.
+DOMAIN_KEYWORDS = {
+    DOMAIN_SOFTWARE_DEV: {
+        "api",
+        "bug",
+        "code",
+        "commit",
+        "docker",
+        "function",
+        "git",
+        "github",
+        "javascript",
+        "python",
+        "refactor",
+        "repository",
+        "typescript",
+        "unit",
+        "test",
+    },
+    DOMAIN_OPS_INFRA: {
+        "cluster",
+        "container",
+        "deploy",
+        "deployment",
+        "incident",
+        "kubernetes",
+        "monitoring",
+        "nginx",
+        "pod",
+        "prod",
+        "service",
+        "systemd",
+        "terraform",
+    },
+    DOMAIN_BUSINESS: {
+        "budget",
+        "client",
+        "deadline",
+        "finance",
+        "invoice",
+        "meeting",
+        "milestone",
+        "project",
+        "quarter",
+        "roadmap",
+        "sales",
+        "stakeholder",
+    },
+    DOMAIN_MEDICAL_LEGAL: {
+        "agreement",
+        "case",
+        "claim",
+        "compliance",
+        "contract",
+        "diagnosis",
+        "liability",
+        "patient",
+        "prescription",
+        "regulation",
+        "symptom",
+        "treatment",
+    },
+}
+
+# Multi-word, lowercase phrases per domain. They are looked up as substrings of
+# the casefolded input text rather than as individual tokens.
+DOMAIN_PHRASES = {
+    DOMAIN_SOFTWARE_DEV: ("pull request", "code review", "integration test"),
+    DOMAIN_OPS_INFRA: ("on call", "service restart", "roll back"),
+    DOMAIN_BUSINESS: ("follow up", "action items", "meeting notes"),
+    DOMAIN_MEDICAL_LEGAL: ("terms and conditions", "medical record", "legal review"),
+}
+
+# Greetings that add a small score to DOMAIN_PERSONAL_NAMES.
+# NOTE(review): the tokenizer regex never produces a token containing a space,
+# so the multi-word entries ("good morning", ...) can only take effect when
+# they are matched against the full text instead of against single tokens.
+GREETING_TOKENS = {"hello", "hi", "hey", "good morning", "good afternoon", "good evening"}
+
+
+@dataclass(frozen=True)
+class DomainResult:
+    """Immutable outcome of a domain inference: the chosen domain and its confidence."""
+
+    # One of the DOMAIN_* identifiers; DOMAIN_GENERAL when nothing specific won.
+    name: str
+    # Share of the total score held by the winning domain, rounded to two
+    # decimals; 0.0 whenever the result falls back to DOMAIN_GENERAL.
+    confidence: float


@dataclass(frozen=True)
@ -14,9 +108,10 @@ class _ReplacementView:


 class VocabularyEngine:
-    def __init__(self, vocab_cfg: VocabularyConfig):
+    def __init__(self, vocab_cfg: VocabularyConfig, domain_cfg: DomainInferenceConfig):
        self._replacements = [_ReplacementView(r.source, r.target) for r in vocab_cfg.replacements]
        self._terms = list(vocab_cfg.terms)
+        self._domain_enabled = bool(domain_cfg.enabled)

        self._replacement_map = {
            _normalize_key(rule.source): rule.target for rule in self._replacements
@ -66,6 +161,55 @@ class VocabularyEngine:
            used += addition
        return "\n".join(out)

+    def infer_domain(self, text: str) -> DomainResult:
+        """Guess which vocabulary domain *text* belongs to.
+
+        Scoring, per domain:
+
+        * +2 for every token occurrence found in ``DOMAIN_KEYWORDS``,
+        * +2 for every ``DOMAIN_PHRASES`` entry contained in the text,
+        * +1 for ``DOMAIN_PERSONAL_NAMES`` when a greeting is present,
+        * +1 for every keyword that appears both in the text and in the
+          configured dictionary (terms, replacement sources and targets).
+
+        Args:
+            text: Free-form input text; it is casefolded before matching.
+
+        Returns:
+            ``DomainResult(DOMAIN_GENERAL, 0.0)`` when inference is disabled,
+            the text yields no usable tokens, the best score is below 2, or the
+            winner holds less than 45% of the total score. Otherwise the
+            winning domain with ``confidence`` = winner score / total score,
+            rounded to two decimals. Ties go to the domain listed first in
+            ``DOMAIN_ORDER``.
+        """
+        general = DomainResult(name=DOMAIN_GENERAL, confidence=0.0)
+        if not self._domain_enabled:
+            return general
+
+        normalized = text.casefold()
+        # The token alphabet keeps ".", "/", "_" and "-" so that terms such as
+        # "node.js" stay whole. Strip them from the edges only, otherwise
+        # sentence punctuation ("... fixed the bug.") hides the keyword.
+        edge_punctuation = "./_-"
+        stripped = (
+            raw.strip(edge_punctuation)
+            for raw in re.findall(r"[a-z0-9+#./_-]+", normalized)
+        )
+        tokens = [token for token in stripped if token]
+        if not tokens:
+            return general
+        token_set = set(tokens)
+
+        scores = {domain: 0 for domain in DOMAIN_ORDER}
+        for domain, keywords in DOMAIN_KEYWORDS.items():
+            # Repeated mentions count every time, hence the list and not the set.
+            scores[domain] += 2 * sum(1 for token in tokens if token in keywords)
+
+        for domain, phrases in DOMAIN_PHRASES.items():
+            scores[domain] += 2 * sum(1 for phrase in phrases if phrase in normalized)
+
+        # Single-word greetings are compared with tokens. Multi-word greetings
+        # such as "good morning" can never equal a token, so they are looked up
+        # in the text the same way DOMAIN_PHRASES entries are.
+        if any(
+            (greeting in normalized) if " " in greeting else (greeting in token_set)
+            for greeting in GREETING_TOKENS
+        ):
+            # NOTE(review): this is the only score DOMAIN_PERSONAL_NAMES can
+            # receive, so it never reaches the threshold of 2 on its own; it
+            # only lowers the winner's confidence. Confirm this is intended.
+            scores[DOMAIN_PERSONAL_NAMES] += 1
+
+        # Boost domains whose keywords the user both configured in the
+        # dictionary and actually used in this text.
+        dictionary_tokens = {
+            token.strip(edge_punctuation) for token in self._dictionary_tokens()
+        }
+        mentioned = dictionary_tokens & token_set
+        for domain, keywords in DOMAIN_KEYWORDS.items():
+            scores[domain] += len(mentioned & keywords)
+
+        # max() returns the first maximal item, which preserves the
+        # DOMAIN_ORDER tie-break of the original strict ">" comparison.
+        top_domain = max(DOMAIN_ORDER, key=scores.__getitem__)
+        top_score = scores[top_domain]
+        if top_score < 2:
+            return general
+
+        # top_score >= 2 guarantees a non-zero total, so the division is safe.
+        confidence = top_score / sum(scores.values())
+        if confidence < 0.45:
+            return general
+
+        return DomainResult(name=top_domain, confidence=round(confidence, 2))
+
    def _build_stt_hotwords(self, *, limit: int, char_budget: int) -> str:
        items = _dedupe_preserve_order(
            [rule.target for rule in self._replacements] + self._terms
@ -92,6 +236,19 @@ class VocabularyEngine:
            return ""
        return prefix + hotwords

+    def _dictionary_tokens(self) -> set[str]:
+        """Return the casefolded tokens of every configured dictionary entry.
+
+        The entries are the source and target of each replacement rule plus
+        the plain vocabulary terms.
+        """
+        entries = [
+            text
+            for rule in self._replacements
+            for text in (rule.source, rule.target)
+        ]
+        entries += self._terms
+        return {
+            piece
+            for entry in entries
+            for piece in re.findall(r"[a-z0-9+#./_-]+", entry.casefold())
+        }
+

 def _build_replacement_pattern(sources: Iterable[str]) -> re.Pattern[str] | None:
    unique_sources = _dedupe_preserve_order(list(sources))