aman/tests/test_aman_benchmarks.py
Thales Maciel 4d0081d1d0 Split aman.py into focused CLI and runtime modules
Break the old god module into flat siblings for CLI parsing, run lifecycle, daemon state, shared processing helpers, benchmark tooling, and maintainer-only model sync so changes stop sharing one giant import graph.

Keep aman as a thin shim over aman_cli, move sync-default-model behind the hidden aman-maint entrypoint plus Make wrappers, and update packaging metadata plus maintainer docs to reflect the new surface.

Retarget the tests to the new seams with dedicated runtime, run, benchmark, maintainer, and entrypoint suites, and verify with python3 -m unittest discover -s tests -p "test_*.py", python3 -m py_compile src/*.py tests/*.py, PYTHONPATH=src python3 -m aman --help, PYTHONPATH=src python3 -m aman version, and PYTHONPATH=src python3 -m aman_maint --help.
2026-03-14 14:54:57 -03:00

191 lines
7.1 KiB
Python

import io
import json
import sys
import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
import aman_benchmarks
import aman_cli
from config import Config
class _FakeBenchEditorStage:
def warmup(self):
return
def rewrite(self, transcript, *, language, dictionary_context):
_ = dictionary_context
return SimpleNamespace(
final_text=f"[{language}] {transcript.strip()}",
latency_ms=1.0,
pass1_ms=0.5,
pass2_ms=0.5,
)
class AmanBenchmarksTests(unittest.TestCase):
def test_bench_command_json_output(self):
args = aman_cli.parse_cli_args(
["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"]
)
out = io.StringIO()
with patch("aman_benchmarks.load", return_value=Config()), patch(
"aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
), patch("sys.stdout", out):
exit_code = aman_benchmarks.bench_command(args)
self.assertEqual(exit_code, 0)
payload = json.loads(out.getvalue())
self.assertEqual(payload["measured_runs"], 2)
self.assertEqual(payload["summary"]["runs"], 2)
self.assertEqual(len(payload["runs"]), 2)
self.assertEqual(payload["editor_backend"], "local_llama_builtin")
self.assertIn("avg_alignment_ms", payload["summary"])
self.assertIn("avg_fact_guard_ms", payload["summary"])
self.assertIn("alignment_applied", payload["runs"][0])
self.assertIn("fact_guard_action", payload["runs"][0])
def test_bench_command_supports_text_file_input(self):
with tempfile.TemporaryDirectory() as td:
text_file = Path(td) / "input.txt"
text_file.write_text("hello from file", encoding="utf-8")
args = aman_cli.parse_cli_args(
["bench", "--text-file", str(text_file), "--repeat", "1", "--warmup", "0", "--print-output"]
)
out = io.StringIO()
with patch("aman_benchmarks.load", return_value=Config()), patch(
"aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
), patch("sys.stdout", out):
exit_code = aman_benchmarks.bench_command(args)
self.assertEqual(exit_code, 0)
self.assertIn("[auto] hello from file", out.getvalue())
def test_bench_command_rejects_empty_input(self):
args = aman_cli.parse_cli_args(["bench", "--text", " "])
with patch("aman_benchmarks.load", return_value=Config()), patch(
"aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
):
exit_code = aman_benchmarks.bench_command(args)
self.assertEqual(exit_code, 1)
def test_bench_command_rejects_non_positive_repeat(self):
args = aman_cli.parse_cli_args(["bench", "--text", "hello", "--repeat", "0"])
with patch("aman_benchmarks.load", return_value=Config()), patch(
"aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
):
exit_code = aman_benchmarks.bench_command(args)
self.assertEqual(exit_code, 1)
def test_eval_models_command_writes_report(self):
with tempfile.TemporaryDirectory() as td:
output_path = Path(td) / "report.json"
args = aman_cli.parse_cli_args(
[
"eval-models",
"--dataset",
"benchmarks/cleanup_dataset.jsonl",
"--matrix",
"benchmarks/model_matrix.small_first.json",
"--output",
str(output_path),
"--json",
]
)
out = io.StringIO()
fake_report = {
"models": [
{
"name": "base",
"best_param_set": {
"latency_ms": {"p50": 1000.0},
"quality": {"hybrid_score_avg": 0.8, "parse_valid_rate": 1.0},
},
}
],
"winner_recommendation": {"name": "base", "reason": "test"},
}
with patch("aman_benchmarks.run_model_eval", return_value=fake_report), patch(
"sys.stdout", out
):
exit_code = aman_benchmarks.eval_models_command(args)
self.assertEqual(exit_code, 0)
self.assertTrue(output_path.exists())
payload = json.loads(output_path.read_text(encoding="utf-8"))
self.assertEqual(payload["winner_recommendation"]["name"], "base")
def test_eval_models_command_forwards_heuristic_arguments(self):
args = aman_cli.parse_cli_args(
[
"eval-models",
"--dataset",
"benchmarks/cleanup_dataset.jsonl",
"--matrix",
"benchmarks/model_matrix.small_first.json",
"--heuristic-dataset",
"benchmarks/heuristics_dataset.jsonl",
"--heuristic-weight",
"0.35",
"--report-version",
"2",
"--json",
]
)
out = io.StringIO()
fake_report = {
"models": [{"name": "base", "best_param_set": {}}],
"winner_recommendation": {"name": "base", "reason": "ok"},
}
with patch("aman_benchmarks.run_model_eval", return_value=fake_report) as run_eval_mock, patch(
"sys.stdout", out
):
exit_code = aman_benchmarks.eval_models_command(args)
self.assertEqual(exit_code, 0)
run_eval_mock.assert_called_once_with(
"benchmarks/cleanup_dataset.jsonl",
"benchmarks/model_matrix.small_first.json",
heuristic_dataset_path="benchmarks/heuristics_dataset.jsonl",
heuristic_weight=0.35,
report_version=2,
verbose=False,
)
def test_build_heuristic_dataset_command_json_output(self):
args = aman_cli.parse_cli_args(
[
"build-heuristic-dataset",
"--input",
"benchmarks/heuristics_dataset.raw.jsonl",
"--output",
"benchmarks/heuristics_dataset.jsonl",
"--json",
]
)
out = io.StringIO()
summary = {
"raw_rows": 4,
"written_rows": 4,
"generated_word_rows": 2,
"output_path": "benchmarks/heuristics_dataset.jsonl",
}
with patch("aman_benchmarks.build_heuristic_dataset", return_value=summary), patch(
"sys.stdout", out
):
exit_code = aman_benchmarks.build_heuristic_dataset_command(args)
self.assertEqual(exit_code, 0)
payload = json.loads(out.getvalue())
self.assertEqual(payload["written_rows"], 4)
if __name__ == "__main__":
unittest.main()