Add benchmark-driven model promotion workflow and pipeline stages

2026-02-28 15:12:33 -03:00 · 2026-02-28 15:12:33 -03:00 · 8c1f7c1e13
commit 8c1f7c1e13
parent 98b13d1069
38 changed files with 5300 additions and 503 deletions
--- a/tests/test_aman_cli.py
+++ b/tests/test_aman_cli.py
@ -4,6 +4,7 @@ import sys
 import tempfile
 import unittest
 from pathlib import Path
+from types import SimpleNamespace
 from unittest.mock import patch

 ROOT = Path(__file__).resolve().parents[1]
@ -92,6 +93,20 @@ class _RetrySetupDesktop(_FakeDesktop):
        on_quit()


+class _FakeBenchEditorStage:  # test double handed back by the patched aman._build_editor_stage in the bench tests
+    def warmup(self):  # no-op: the fake has nothing to prepare
+        return
+
+    def rewrite(self, transcript, *, language, dictionary_context):  # echoes the input instead of doing real editing work
+        _ = dictionary_context  # accepted but deliberately unused
+        return SimpleNamespace(
+            final_text=f"[{language}] {transcript.strip()}",  # deterministic: language tag + whitespace-stripped transcript
+            latency_ms=1.0,  # constant timings keep results reproducible
+            pass1_ms=0.5,
+            pass2_ms=0.5,
+        )
+
+
 class AmanCliTests(unittest.TestCase):
    def test_parse_cli_args_defaults_to_run_command(self):
        args = aman._parse_cli_args(["--dry-run"])
@ -111,6 +126,85 @@ class AmanCliTests(unittest.TestCase):
        self.assertEqual(args.command, "self-check")
        self.assertTrue(args.json)

+    def test_parse_cli_args_bench_command(self):  # "bench" subcommand accepts --text/--repeat/--warmup/--json
+        args = aman._parse_cli_args(
+            ["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"]
+        )
+
+        self.assertEqual(args.command, "bench")
+        self.assertEqual(args.text, "hello")
+        self.assertEqual(args.repeat, 2)  # numeric options must come back as ints, not the raw "2"/"0" strings
+        self.assertEqual(args.warmup, 0)
+        self.assertTrue(args.json)
+
+    def test_parse_cli_args_bench_requires_input(self):  # "bench" given neither --text nor --text-file must be rejected
+        with self.assertRaises(SystemExit):  # parsing is expected to exit instead of returning a namespace
+            aman._parse_cli_args(["bench"])
+
+    def test_parse_cli_args_eval_models_command(self):  # "eval-models" with only --dataset/--matrix; checks the option defaults
+        args = aman._parse_cli_args(
+            ["eval-models", "--dataset", "benchmarks/cleanup_dataset.jsonl", "--matrix", "benchmarks/model_matrix.small_first.json"]
+        )
+        self.assertEqual(args.command, "eval-models")
+        self.assertEqual(args.dataset, "benchmarks/cleanup_dataset.jsonl")
+        self.assertEqual(args.matrix, "benchmarks/model_matrix.small_first.json")
+        self.assertEqual(args.heuristic_dataset, "")  # omitted heuristic dataset defaults to the empty string
+        self.assertEqual(args.heuristic_weight, 0.25)  # default weight when --heuristic-weight is not given
+        self.assertEqual(args.report_version, 2)  # default report version when --report-version is not given
+
+    def test_parse_cli_args_eval_models_with_heuristic_options(self):  # explicit heuristic flags override the defaults
+        args = aman._parse_cli_args(
+            [
+                "eval-models",
+                "--dataset",
+                "benchmarks/cleanup_dataset.jsonl",
+                "--matrix",
+                "benchmarks/model_matrix.small_first.json",
+                "--heuristic-dataset",
+                "benchmarks/heuristics_dataset.jsonl",
+                "--heuristic-weight",
+                "0.4",
+                "--report-version",
+                "2",
+            ]
+        )
+        self.assertEqual(args.heuristic_dataset, "benchmarks/heuristics_dataset.jsonl")
+        self.assertEqual(args.heuristic_weight, 0.4)  # "0.4" must be converted to a float
+        self.assertEqual(args.report_version, 2)  # "2" must be converted to an int
+
+    def test_parse_cli_args_build_heuristic_dataset_command(self):  # "build-heuristic-dataset" exposes --input and --output
+        args = aman._parse_cli_args(
+            [
+                "build-heuristic-dataset",
+                "--input",
+                "benchmarks/heuristics_dataset.raw.jsonl",
+                "--output",
+                "benchmarks/heuristics_dataset.jsonl",
+            ]
+        )
+        self.assertEqual(args.command, "build-heuristic-dataset")
+        self.assertEqual(args.input, "benchmarks/heuristics_dataset.raw.jsonl")  # paths are kept as the given strings
+        self.assertEqual(args.output, "benchmarks/heuristics_dataset.jsonl")
+
+    def test_parse_cli_args_sync_default_model_command(self):  # "sync-default-model" exposes --report/--artifacts/--constants/--check
+        args = aman._parse_cli_args(
+            [
+                "sync-default-model",
+                "--report",
+                "benchmarks/results/latest.json",
+                "--artifacts",
+                "benchmarks/model_artifacts.json",
+                "--constants",
+                "src/constants.py",
+                "--check",
+            ]
+        )
+        self.assertEqual(args.command, "sync-default-model")
+        self.assertEqual(args.report, "benchmarks/results/latest.json")
+        self.assertEqual(args.artifacts, "benchmarks/model_artifacts.json")
+        self.assertEqual(args.constants, "src/constants.py")
+        self.assertTrue(args.check)  # --check is a valueless flag that parses as truthy
+
    def test_version_command_prints_version(self):
        out = io.StringIO()
        args = aman._parse_cli_args(["version"])
@ -145,6 +239,259 @@ class AmanCliTests(unittest.TestCase):
        self.assertEqual(exit_code, 2)
        self.assertIn("[FAIL] config.load", out.getvalue())

+    def test_bench_command_json_output(self):  # bench --json prints a JSON payload with per-run entries and a summary
+        args = aman._parse_cli_args(["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"])
+        out = io.StringIO()
+        with patch("aman.load", return_value=Config()), patch(  # a fresh Config() replaces whatever aman.load would return
+            "aman._build_editor_stage", return_value=_FakeBenchEditorStage()  # fake stage replaces the real editor stage
+        ), patch("sys.stdout", out):  # capture printed output for inspection
+            exit_code = aman._bench_command(args)
+
+        self.assertEqual(exit_code, 0)
+        payload = json.loads(out.getvalue())  # everything written to stdout must be one valid JSON document
+        self.assertEqual(payload["measured_runs"], 2)  # matches --repeat 2 (with --warmup 0)
+        self.assertEqual(payload["summary"]["runs"], 2)
+        self.assertEqual(len(payload["runs"]), 2)
+        self.assertEqual(payload["editor_backend"], "local_llama_builtin")  # NOTE(review): presumably the backend reported for a default Config - confirm
+        self.assertIn("avg_alignment_ms", payload["summary"])  # key presence only; values are not checked
+        self.assertIn("avg_fact_guard_ms", payload["summary"])
+        self.assertIn("alignment_applied", payload["runs"][0])
+        self.assertIn("fact_guard_action", payload["runs"][0])
+
+    def test_bench_command_supports_text_file_input(self):  # bench reads its input from --text-file and prints the result
+        with tempfile.TemporaryDirectory() as td:
+            text_file = Path(td) / "input.txt"
+            text_file.write_text("hello from file", encoding="utf-8")
+            args = aman._parse_cli_args(
+                ["bench", "--text-file", str(text_file), "--repeat", "1", "--warmup", "0", "--print-output"]
+            )
+            out = io.StringIO()
+            with patch("aman.load", return_value=Config()), patch(
+                "aman._build_editor_stage", return_value=_FakeBenchEditorStage()
+            ), patch("sys.stdout", out):
+                exit_code = aman._bench_command(args)
+
+        self.assertEqual(exit_code, 0)  # asserted after the temp dir is gone; only captured values are used
+        self.assertIn("[auto] hello from file", out.getvalue())  # fake stage echoes "[<language>] <text>", so language was "auto"
+
+    def test_bench_command_rejects_empty_input(self):  # whitespace-only --text parses fine but the command must fail
+        args = aman._parse_cli_args(["bench", "--text", "   "])
+        with patch("aman.load", return_value=Config()), patch(
+            "aman._build_editor_stage", return_value=_FakeBenchEditorStage()
+        ):
+            exit_code = aman._bench_command(args)
+
+        self.assertEqual(exit_code, 1)  # failure is reported through the exit code
+
+    def test_bench_command_rejects_non_positive_repeat(self):  # --repeat 0 parses fine but the command must fail
+        args = aman._parse_cli_args(["bench", "--text", "hello", "--repeat", "0"])
+        with patch("aman.load", return_value=Config()), patch(
+            "aman._build_editor_stage", return_value=_FakeBenchEditorStage()
+        ):
+            exit_code = aman._bench_command(args)
+
+        self.assertEqual(exit_code, 1)  # failure is reported through the exit code
+
+    def test_eval_models_command_writes_report(self):  # eval-models with --output writes the report as JSON to that path
+        with tempfile.TemporaryDirectory() as td:
+            output_path = Path(td) / "report.json"
+            args = aman._parse_cli_args(
+                [
+                    "eval-models",
+                    "--dataset",
+                    "benchmarks/cleanup_dataset.jsonl",
+                    "--matrix",
+                    "benchmarks/model_matrix.small_first.json",
+                    "--output",
+                    str(output_path),
+                    "--json",
+                ]
+            )
+            out = io.StringIO()
+            fake_report = {  # canned result returned by the patched aman.run_model_eval
+                "models": [{"name": "base", "best_param_set": {"latency_ms": {"p50": 1000.0}, "quality": {"hybrid_score_avg": 0.8, "parse_valid_rate": 1.0}}}],
+                "winner_recommendation": {"name": "base", "reason": "test"},
+            }
+            with patch("aman.run_model_eval", return_value=fake_report), patch("sys.stdout", out):  # no real evaluation; stdout captured
+                exit_code = aman._eval_models_command(args)
+            self.assertEqual(exit_code, 0)
+            self.assertTrue(output_path.exists())
+            payload = json.loads(output_path.read_text(encoding="utf-8"))  # written file must be valid JSON
+            self.assertEqual(payload["winner_recommendation"]["name"], "base")  # and must carry the winner from the report
+
+    def test_eval_models_command_forwards_heuristic_arguments(self):  # CLI heuristic options must reach aman.run_model_eval
+        args = aman._parse_cli_args(
+            [
+                "eval-models",
+                "--dataset",
+                "benchmarks/cleanup_dataset.jsonl",
+                "--matrix",
+                "benchmarks/model_matrix.small_first.json",
+                "--heuristic-dataset",
+                "benchmarks/heuristics_dataset.jsonl",
+                "--heuristic-weight",
+                "0.35",
+                "--report-version",
+                "2",
+                "--json",
+            ]
+        )
+        out = io.StringIO()
+        fake_report = {  # minimal report: one model with an empty best_param_set
+            "models": [{"name": "base", "best_param_set": {}}],
+            "winner_recommendation": {"name": "base", "reason": "ok"},
+        }
+        with patch("aman.run_model_eval", return_value=fake_report) as run_eval_mock, patch(  # keep the mock to inspect its call
+            "sys.stdout", out
+        ):
+            exit_code = aman._eval_models_command(args)
+        self.assertEqual(exit_code, 0)
+        run_eval_mock.assert_called_once_with(  # exactly one call: dataset/matrix positional, the rest as keywords
+            "benchmarks/cleanup_dataset.jsonl",
+            "benchmarks/model_matrix.small_first.json",
+            heuristic_dataset_path="benchmarks/heuristics_dataset.jsonl",
+            heuristic_weight=0.35,
+            report_version=2,
+            verbose=False,  # no verbosity flag was passed on the command line above
+        )
+
+    def test_build_heuristic_dataset_command_json_output(self):  # build-heuristic-dataset --json prints the summary as JSON
+        args = aman._parse_cli_args(
+            [
+                "build-heuristic-dataset",
+                "--input",
+                "benchmarks/heuristics_dataset.raw.jsonl",
+                "--output",
+                "benchmarks/heuristics_dataset.jsonl",
+                "--json",
+            ]
+        )
+        out = io.StringIO()
+        summary = {  # canned return value for the patched aman.build_heuristic_dataset
+            "raw_rows": 4,
+            "written_rows": 4,
+            "generated_word_rows": 2,
+            "output_path": "benchmarks/heuristics_dataset.jsonl",
+        }
+        with patch("aman.build_heuristic_dataset", return_value=summary), patch("sys.stdout", out):  # builder is mocked; stdout captured
+            exit_code = aman._build_heuristic_dataset_command(args)
+        self.assertEqual(exit_code, 0)
+        payload = json.loads(out.getvalue())  # stdout must be one valid JSON document
+        self.assertEqual(payload["written_rows"], 4)  # value comes straight from the mocked summary
+
+    def test_sync_default_model_command_updates_constants(self):  # without --check, the constants file is rewritten to the winner
+        with tempfile.TemporaryDirectory() as td:
+            report_path = Path(td) / "latest.json"
+            artifacts_path = Path(td) / "artifacts.json"
+            constants_path = Path(td) / "constants.py"
+            report_path.write_text(  # report fixture: names the winning model only
+                json.dumps(
+                    {
+                        "winner_recommendation": {
+                            "name": "test-model",
+                        }
+                    }
+                ),
+                encoding="utf-8",
+            )
+            artifacts_path.write_text(  # artifacts fixture: filename/url/sha256 for the model named in the report
+                json.dumps(
+                    {
+                        "models": [
+                            {
+                                "name": "test-model",
+                                "filename": "winner.gguf",
+                                "url": "https://example.invalid/winner.gguf",
+                                "sha256": "a" * 64,
+                            }
+                        ]
+                    }
+                ),
+                encoding="utf-8",
+            )
+            constants_path.write_text(  # constants fixture: starts out with "old" values that differ from the artifact
+                (
+                    'MODEL_NAME = "old.gguf"\n'
+                    'MODEL_URL = "https://example.invalid/old.gguf"\n'
+                    'MODEL_SHA256 = "' + ("b" * 64) + '"\n'
+                ),
+                encoding="utf-8",
+            )
+
+            args = aman._parse_cli_args(
+                [
+                    "sync-default-model",
+                    "--report",
+                    str(report_path),
+                    "--artifacts",
+                    str(artifacts_path),
+                    "--constants",
+                    str(constants_path),
+                ]
+            )
+            exit_code = aman._sync_default_model_command(args)
+            self.assertEqual(exit_code, 0)
+            updated = constants_path.read_text(encoding="utf-8")  # re-read the file the command was pointed at
+            self.assertIn('MODEL_NAME = "winner.gguf"', updated)  # MODEL_NAME takes the artifact's filename, not its name
+            self.assertIn('MODEL_URL = "https://example.invalid/winner.gguf"', updated)
+            self.assertIn('MODEL_SHA256 = "' + ("a" * 64) + '"', updated)
+
+    def test_sync_default_model_command_check_mode_returns_2_on_drift(self):  # with --check, drift is reported but nothing is written
+        with tempfile.TemporaryDirectory() as td:
+            report_path = Path(td) / "latest.json"
+            artifacts_path = Path(td) / "artifacts.json"
+            constants_path = Path(td) / "constants.py"
+            report_path.write_text(  # report fixture: names the winning model only
+                json.dumps(
+                    {
+                        "winner_recommendation": {
+                            "name": "test-model",
+                        }
+                    }
+                ),
+                encoding="utf-8",
+            )
+            artifacts_path.write_text(  # artifacts fixture: filename/url/sha256 for the model named in the report
+                json.dumps(
+                    {
+                        "models": [
+                            {
+                                "name": "test-model",
+                                "filename": "winner.gguf",
+                                "url": "https://example.invalid/winner.gguf",
+                                "sha256": "a" * 64,
+                            }
+                        ]
+                    }
+                ),
+                encoding="utf-8",
+            )
+            constants_path.write_text(  # constants fixture: "old" values, i.e. out of sync with the artifact
+                (
+                    'MODEL_NAME = "old.gguf"\n'
+                    'MODEL_URL = "https://example.invalid/old.gguf"\n'
+                    'MODEL_SHA256 = "' + ("b" * 64) + '"\n'
+                ),
+                encoding="utf-8",
+            )
+
+            args = aman._parse_cli_args(
+                [
+                    "sync-default-model",
+                    "--report",
+                    str(report_path),
+                    "--artifacts",
+                    str(artifacts_path),
+                    "--constants",
+                    str(constants_path),
+                    "--check",
+                ]
+            )
+            exit_code = aman._sync_default_model_command(args)
+            self.assertEqual(exit_code, 2)  # exit code 2 signals that constants differ from the winner
+            updated = constants_path.read_text(encoding="utf-8")
+            self.assertIn('MODEL_NAME = "old.gguf"', updated)  # the file still holds the old value: check mode did not rewrite it
+
    def test_init_command_creates_default_config(self):
        with tempfile.TemporaryDirectory() as td:
            path = Path(td) / "config.json"