Add vocabulary correction pipeline and example config

This commit is contained in:
Thales Maciel 2026-02-25 10:03:32 -03:00
parent f9224621fa
commit c3503fbbde
9 changed files with 865 additions and 23 deletions

View file

@@ -27,6 +27,12 @@ class ConfigTests(unittest.TestCase):
self.assertEqual(cfg.injection.backend, "clipboard")
self.assertTrue(cfg.ai.enabled)
self.assertFalse(cfg.logging.log_transcript)
self.assertEqual(cfg.vocabulary.replacements, [])
self.assertEqual(cfg.vocabulary.terms, [])
self.assertEqual(cfg.vocabulary.max_rules, 500)
self.assertEqual(cfg.vocabulary.max_terms, 500)
self.assertTrue(cfg.domain_inference.enabled)
self.assertEqual(cfg.domain_inference.mode, "auto")
def test_loads_nested_config(self):
payload = {
@@ -36,6 +42,16 @@ class ConfigTests(unittest.TestCase):
"injection": {"backend": "injection"},
"ai": {"enabled": False},
"logging": {"log_transcript": True},
"vocabulary": {
"replacements": [
{"from": "Martha", "to": "Marta"},
{"from": "docker", "to": "Docker"},
],
"terms": ["Systemd", "Kubernetes"],
"max_rules": 100,
"max_terms": 200,
},
"domain_inference": {"enabled": True, "mode": "auto"},
}
with tempfile.TemporaryDirectory() as td:
path = Path(td) / "config.json"
@@ -50,6 +66,14 @@ class ConfigTests(unittest.TestCase):
self.assertEqual(cfg.injection.backend, "injection")
self.assertFalse(cfg.ai.enabled)
self.assertTrue(cfg.logging.log_transcript)
self.assertEqual(cfg.vocabulary.max_rules, 100)
self.assertEqual(cfg.vocabulary.max_terms, 200)
self.assertEqual(len(cfg.vocabulary.replacements), 2)
self.assertEqual(cfg.vocabulary.replacements[0].source, "Martha")
self.assertEqual(cfg.vocabulary.replacements[0].target, "Marta")
self.assertEqual(cfg.vocabulary.terms, ["Systemd", "Kubernetes"])
self.assertTrue(cfg.domain_inference.enabled)
self.assertEqual(cfg.domain_inference.mode, "auto")
def test_loads_legacy_keys(self):
payload = {
@@ -74,6 +98,7 @@ class ConfigTests(unittest.TestCase):
self.assertEqual(cfg.injection.backend, "clipboard")
self.assertFalse(cfg.ai.enabled)
self.assertTrue(cfg.logging.log_transcript)
self.assertEqual(cfg.vocabulary.replacements, [])
def test_invalid_injection_backend_raises(self):
payload = {"injection": {"backend": "invalid"}}
@@ -93,6 +118,65 @@ class ConfigTests(unittest.TestCase):
with self.assertRaisesRegex(ValueError, "logging.log_transcript"):
load(str(path))
def test_conflicting_replacements_raise(self):
    # Two rules whose sources differ only by letter case but point at
    # different targets; loading is expected to fail with a ValueError
    # whose message matches "conflicting".
    rules = [
        {"from": "Martha", "to": "Marta"},
        {"from": "martha", "to": "Martha"},
    ]
    config_doc = {"vocabulary": {"replacements": rules}}

    with tempfile.TemporaryDirectory() as tmp_dir:
        config_file = Path(tmp_dir, "config.json")
        config_file.write_text(json.dumps(config_doc), encoding="utf-8")
        with self.assertRaisesRegex(ValueError, "conflicting"):
            load(str(config_file))
def test_duplicate_rules_and_terms_are_deduplicated(self):
    # Both replacement rules share a target and their sources differ only by
    # letter case; the two terms likewise differ only by case. The loaded
    # config is expected to keep just the first entry of each pair.
    rules = [
        {"from": "docker", "to": "Docker"},
        {"from": "DOCKER", "to": "Docker"},
    ]
    config_doc = {
        "vocabulary": {
            "replacements": rules,
            "terms": ["Systemd", "systemd"],
        }
    }

    with tempfile.TemporaryDirectory() as tmp_dir:
        config_file = Path(tmp_dir, "config.json")
        config_file.write_text(json.dumps(config_doc), encoding="utf-8")
        cfg = load(str(config_file))

    # Check the count first so the index access below cannot mask a failure.
    self.assertEqual(len(cfg.vocabulary.replacements), 1)
    surviving_rule = cfg.vocabulary.replacements[0]
    self.assertEqual(surviving_rule.source, "docker")
    self.assertEqual(surviving_rule.target, "Docker")
    self.assertEqual(cfg.vocabulary.terms, ["Systemd"])
def test_wildcard_term_raises(self):
    # A vocabulary term containing "*" is expected to be rejected with a
    # ValueError whose message matches "wildcard".
    config_doc = {"vocabulary": {"terms": ["Dock*"]}}

    with tempfile.TemporaryDirectory() as tmp_dir:
        config_file = Path(tmp_dir, "config.json")
        config_file.write_text(json.dumps(config_doc), encoding="utf-8")
        with self.assertRaisesRegex(ValueError, "wildcard"):
            load(str(config_file))
def test_invalid_domain_mode_raises(self):
    # "heuristic" is used here as an unsupported domain_inference mode; the
    # loader is expected to raise a ValueError naming the offending key.
    config_doc = {"domain_inference": {"mode": "heuristic"}}

    with tempfile.TemporaryDirectory() as tmp_dir:
        config_file = Path(tmp_dir, "config.json")
        config_file.write_text(json.dumps(config_doc), encoding="utf-8")
        with self.assertRaisesRegex(ValueError, "domain_inference.mode"):
            load(str(config_file))
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()