"""CLI-level tests for the benchmark subcommands in ``aman_benchmarks``."""

import io
import json
import sys
import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch

ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import aman_benchmarks
import aman_cli
from config import Config


class _FakeBenchEditorStage:
    """Deterministic stand-in for the real editor stage (no model calls)."""

    def warmup(self):
        """No-op: a fake stage has nothing to warm up."""

    def rewrite(self, transcript, *, language, dictionary_context):
        """Echo the stripped transcript, tagged with the requested language."""
        del dictionary_context  # accepted for interface parity only
        cleaned = transcript.strip()
        return SimpleNamespace(
            final_text=f"[{language}] {cleaned}",
            latency_ms=1.0,
            pass1_ms=0.5,
            pass2_ms=0.5,
        )


def _bench_patches():
    """Return the (config-load, editor-stage) patchers every bench test uses."""
    config_patch = patch("aman_benchmarks.load", return_value=Config())
    stage_patch = patch(
        "aman_benchmarks.build_editor_stage",
        return_value=_FakeBenchEditorStage(),
    )
    return config_patch, stage_patch


class AmanBenchmarksTests(unittest.TestCase):
    """Exercise bench / eval-models / build-heuristic-dataset commands."""

    def test_bench_command_json_output(self):
        parsed = aman_cli.parse_cli_args(
            ["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"]
        )
        captured = io.StringIO()
        config_patch, stage_patch = _bench_patches()
        with config_patch, stage_patch, patch("sys.stdout", captured):
            status = aman_benchmarks.bench_command(parsed)
        self.assertEqual(status, 0)
        report = json.loads(captured.getvalue())
        self.assertEqual(report["measured_runs"], 2)
        self.assertEqual(report["summary"]["runs"], 2)
        self.assertEqual(len(report["runs"]), 2)
        self.assertEqual(report["editor_backend"], "local_llama_builtin")
        for summary_key in ("avg_alignment_ms", "avg_fact_guard_ms"):
            self.assertIn(summary_key, report["summary"])
        for run_key in ("alignment_applied", "fact_guard_action"):
            self.assertIn(run_key, report["runs"][0])

    def test_bench_command_supports_text_file_input(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            source = Path(tmp_dir) / "input.txt"
            source.write_text("hello from file", encoding="utf-8")
            parsed = aman_cli.parse_cli_args(
                [
                    "bench",
                    "--text-file",
                    str(source),
                    "--repeat",
                    "1",
                    "--warmup",
                    "0",
                    "--print-output",
                ]
            )
            captured = io.StringIO()
            config_patch, stage_patch = _bench_patches()
            with config_patch, stage_patch, patch("sys.stdout", captured):
                status = aman_benchmarks.bench_command(parsed)
            self.assertEqual(status, 0)
            self.assertIn("[auto] hello from file", captured.getvalue())

    def test_bench_command_rejects_empty_input(self):
        parsed = aman_cli.parse_cli_args(["bench", "--text", " "])
        config_patch, stage_patch = _bench_patches()
        with config_patch, stage_patch:
            status = aman_benchmarks.bench_command(parsed)
        self.assertEqual(status, 1)

    def test_bench_command_rejects_non_positive_repeat(self):
        parsed = aman_cli.parse_cli_args(["bench", "--text", "hello", "--repeat", "0"])
        config_patch, stage_patch = _bench_patches()
        with config_patch, stage_patch:
            status = aman_benchmarks.bench_command(parsed)
        self.assertEqual(status, 1)

    def test_eval_models_command_writes_report(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            report_path = Path(tmp_dir) / "report.json"
            parsed = aman_cli.parse_cli_args(
                [
                    "eval-models",
                    "--dataset",
                    "benchmarks/cleanup_dataset.jsonl",
                    "--matrix",
                    "benchmarks/model_matrix.small_first.json",
                    "--output",
                    str(report_path),
                    "--json",
                ]
            )
            captured = io.StringIO()
            stub_report = {
                "models": [
                    {
                        "name": "base",
                        "best_param_set": {
                            "latency_ms": {"p50": 1000.0},
                            "quality": {
                                "hybrid_score_avg": 0.8,
                                "parse_valid_rate": 1.0,
                            },
                        },
                    }
                ],
                "winner_recommendation": {"name": "base", "reason": "test"},
            }
            eval_patch = patch(
                "aman_benchmarks.run_model_eval", return_value=stub_report
            )
            with eval_patch, patch("sys.stdout", captured):
                status = aman_benchmarks.eval_models_command(parsed)
            self.assertEqual(status, 0)
            self.assertTrue(report_path.exists())
            written = json.loads(report_path.read_text(encoding="utf-8"))
            self.assertEqual(written["winner_recommendation"]["name"], "base")

    def test_eval_models_command_forwards_heuristic_arguments(self):
        parsed = aman_cli.parse_cli_args(
            [
                "eval-models",
                "--dataset",
                "benchmarks/cleanup_dataset.jsonl",
                "--matrix",
                "benchmarks/model_matrix.small_first.json",
                "--heuristic-dataset",
                "benchmarks/heuristics_dataset.jsonl",
                "--heuristic-weight",
                "0.35",
                "--report-version",
                "2",
                "--json",
            ]
        )
        captured = io.StringIO()
        stub_report = {
            "models": [{"name": "base", "best_param_set": {}}],
            "winner_recommendation": {"name": "base", "reason": "ok"},
        }
        eval_patch = patch("aman_benchmarks.run_model_eval", return_value=stub_report)
        with eval_patch as run_eval_mock, patch("sys.stdout", captured):
            status = aman_benchmarks.eval_models_command(parsed)
        self.assertEqual(status, 0)
        # The command must pass the heuristic knobs through verbatim.
        run_eval_mock.assert_called_once_with(
            "benchmarks/cleanup_dataset.jsonl",
            "benchmarks/model_matrix.small_first.json",
            heuristic_dataset_path="benchmarks/heuristics_dataset.jsonl",
            heuristic_weight=0.35,
            report_version=2,
            verbose=False,
        )

    def test_build_heuristic_dataset_command_json_output(self):
        parsed = aman_cli.parse_cli_args(
            [
                "build-heuristic-dataset",
                "--input",
                "benchmarks/heuristics_dataset.raw.jsonl",
                "--output",
                "benchmarks/heuristics_dataset.jsonl",
                "--json",
            ]
        )
        captured = io.StringIO()
        stub_summary = {
            "raw_rows": 4,
            "written_rows": 4,
            "generated_word_rows": 2,
            "output_path": "benchmarks/heuristics_dataset.jsonl",
        }
        build_patch = patch(
            "aman_benchmarks.build_heuristic_dataset", return_value=stub_summary
        )
        with build_patch, patch("sys.stdout", captured):
            status = aman_benchmarks.build_heuristic_dataset_command(parsed)
        self.assertEqual(status, 0)
        self.assertEqual(json.loads(captured.getvalue())["written_rows"], 4)


if __name__ == "__main__":
    unittest.main()