# aman/tests/test_aman_benchmarks.py

import io
import json
import sys
import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch

# Make the project's ``src`` directory importable so the modules under test
# can be imported by bare name further down, without installing the package.
ROOT = Path(__file__).resolve().parent.parent
SRC = ROOT.joinpath("src")
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))

import aman_benchmarks
import aman_cli
from config import Config


class _FakeBenchEditorStage:
    def warmup(self):
        return

    def rewrite(self, transcript, *, language, dictionary_context):
        _ = dictionary_context
        return SimpleNamespace(
            final_text=f"[{language}] {transcript.strip()}",
            latency_ms=1.0,
            pass1_ms=0.5,
            pass2_ms=0.5,
        )


class AmanBenchmarksTests(unittest.TestCase):
    """Tests for the ``aman_benchmarks`` command entry points.

    Every test parses a real argument vector through
    ``aman_cli.parse_cli_args`` and patches out the collaborators that the
    command would otherwise call (config loading, editor stage construction,
    model evaluation, dataset building).
    """

    @staticmethod
    def _invoke_bench(argv, stdout=None):
        """Parse ``argv`` and run ``bench_command`` with its collaborators stubbed.

        ``aman_benchmarks.load`` is patched to return a default ``Config`` and
        ``aman_benchmarks.build_editor_stage`` to return a
        ``_FakeBenchEditorStage``. When ``stdout`` is provided, ``sys.stdout``
        is replaced by it for the duration of the call; otherwise it is left
        untouched. Returns the command's exit code.
        """
        parsed = aman_cli.parse_cli_args(argv)
        with patch("aman_benchmarks.load", return_value=Config()):
            with patch(
                "aman_benchmarks.build_editor_stage",
                return_value=_FakeBenchEditorStage(),
            ):
                if stdout is None:
                    return aman_benchmarks.bench_command(parsed)
                with patch("sys.stdout", stdout):
                    return aman_benchmarks.bench_command(parsed)

    def test_bench_command_json_output(self):
        # With --json the command must print a JSON report describing both
        # measured runs and the aggregated summary.
        captured = io.StringIO()
        status = self._invoke_bench(
            ["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"],
            stdout=captured,
        )

        self.assertEqual(status, 0)
        report = json.loads(captured.getvalue())
        self.assertEqual(report["measured_runs"], 2)
        summary = report["summary"]
        self.assertEqual(summary["runs"], 2)
        runs = report["runs"]
        self.assertEqual(len(runs), 2)
        self.assertEqual(report["editor_backend"], "local_llama_builtin")
        for summary_key in ("avg_alignment_ms", "avg_fact_guard_ms"):
            self.assertIn(summary_key, summary)
        first_run = runs[0]
        for run_key in ("alignment_applied", "fact_guard_action"):
            self.assertIn(run_key, first_run)

    def test_bench_command_supports_text_file_input(self):
        # The transcript may come from a file; --print-output echoes the
        # rewritten text, which here is the fake stage's tagged echo.
        captured = io.StringIO()
        with tempfile.TemporaryDirectory() as workdir:
            source = Path(workdir) / "input.txt"
            source.write_text("hello from file", encoding="utf-8")
            status = self._invoke_bench(
                [
                    "bench",
                    "--text-file",
                    str(source),
                    "--repeat",
                    "1",
                    "--warmup",
                    "0",
                    "--print-output",
                ],
                stdout=captured,
            )

        self.assertEqual(status, 0)
        self.assertIn("[auto] hello from file", captured.getvalue())

    def test_bench_command_rejects_empty_input(self):
        # Whitespace-only text counts as empty input and must fail.
        status = self._invoke_bench(["bench", "--text", "   "])

        self.assertEqual(status, 1)

    def test_bench_command_rejects_non_positive_repeat(self):
        # A repeat count of zero is invalid and must fail.
        status = self._invoke_bench(["bench", "--text", "hello", "--repeat", "0"])

        self.assertEqual(status, 1)

    def test_eval_models_command_writes_report(self):
        # The report returned by run_model_eval must be persisted to --output.
        with tempfile.TemporaryDirectory() as workdir:
            report_path = Path(workdir) / "report.json"
            argv = [
                "eval-models",
                "--dataset",
                "benchmarks/cleanup_dataset.jsonl",
                "--matrix",
                "benchmarks/model_matrix.small_first.json",
                "--output",
                str(report_path),
                "--json",
            ]
            parsed = aman_cli.parse_cli_args(argv)
            captured = io.StringIO()
            best_params = {
                "latency_ms": {"p50": 1000.0},
                "quality": {"hybrid_score_avg": 0.8, "parse_valid_rate": 1.0},
            }
            canned_report = {
                "models": [{"name": "base", "best_param_set": best_params}],
                "winner_recommendation": {"name": "base", "reason": "test"},
            }
            with patch("aman_benchmarks.run_model_eval", return_value=canned_report):
                with patch("sys.stdout", captured):
                    status = aman_benchmarks.eval_models_command(parsed)
            self.assertEqual(status, 0)
            self.assertTrue(report_path.exists())
            written = json.loads(report_path.read_text(encoding="utf-8"))
            self.assertEqual(written["winner_recommendation"]["name"], "base")

    def test_eval_models_command_forwards_heuristic_arguments(self):
        # Heuristic-related CLI options must reach run_model_eval unchanged,
        # converted to their parsed types.
        argv = [
            "eval-models",
            "--dataset",
            "benchmarks/cleanup_dataset.jsonl",
            "--matrix",
            "benchmarks/model_matrix.small_first.json",
            "--heuristic-dataset",
            "benchmarks/heuristics_dataset.jsonl",
            "--heuristic-weight",
            "0.35",
            "--report-version",
            "2",
            "--json",
        ]
        parsed = aman_cli.parse_cli_args(argv)
        captured = io.StringIO()
        canned_report = {
            "models": [{"name": "base", "best_param_set": {}}],
            "winner_recommendation": {"name": "base", "reason": "ok"},
        }
        with patch(
            "aman_benchmarks.run_model_eval", return_value=canned_report
        ) as run_eval_mock:
            with patch("sys.stdout", captured):
                status = aman_benchmarks.eval_models_command(parsed)
        self.assertEqual(status, 0)
        expected_kwargs = {
            "heuristic_dataset_path": "benchmarks/heuristics_dataset.jsonl",
            "heuristic_weight": 0.35,
            "report_version": 2,
            "verbose": False,
        }
        run_eval_mock.assert_called_once_with(
            "benchmarks/cleanup_dataset.jsonl",
            "benchmarks/model_matrix.small_first.json",
            **expected_kwargs,
        )

    def test_build_heuristic_dataset_command_json_output(self):
        # With --json the builder's summary dict is printed as JSON.
        argv = [
            "build-heuristic-dataset",
            "--input",
            "benchmarks/heuristics_dataset.raw.jsonl",
            "--output",
            "benchmarks/heuristics_dataset.jsonl",
            "--json",
        ]
        parsed = aman_cli.parse_cli_args(argv)
        captured = io.StringIO()
        canned_summary = {
            "raw_rows": 4,
            "written_rows": 4,
            "generated_word_rows": 2,
            "output_path": "benchmarks/heuristics_dataset.jsonl",
        }
        with patch(
            "aman_benchmarks.build_heuristic_dataset", return_value=canned_summary
        ):
            with patch("sys.stdout", captured):
                status = aman_benchmarks.build_heuristic_dataset_command(parsed)
        self.assertEqual(status, 0)
        printed = json.loads(captured.getvalue())
        self.assertEqual(printed["written_rows"], 4)


# Allow running this test module directly as a script, in addition to
# discovery through a test runner.
if __name__ == "__main__":
    unittest.main()