Break the old god module into flat sibling modules for CLI parsing, run lifecycle, daemon state, shared processing helpers, benchmark tooling, and maintainer-only model sync, so changes stop sharing one giant import graph. Keep aman as a thin shim over aman_cli (sketched below), move sync-default-model behind the hidden aman-maint entrypoint plus Make wrappers, and update the packaging metadata and maintainer docs to reflect the new surface. Retarget the tests to the new seams with dedicated runtime, run, benchmark, maintainer, and entrypoint suites, and verify with:

- python3 -m unittest discover -s tests -p "test_*.py"
- python3 -m py_compile src/*.py tests/*.py
- PYTHONPATH=src python3 -m aman --help
- PYTHONPATH=src python3 -m aman version
- PYTHONPATH=src python3 -m aman_maint --help
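For orientation, a minimal sketch of the thin-shim layer follows. It assumes aman_cli exposes a main(argv) entry point; that name is an assumption for illustration (only aman_cli.parse_cli_args is exercised in the test file below), so treat this as the shape of the shim, not the real module contents.

# src/aman.py -- illustrative shim only; the real module may differ.
# All parsing and run logic lives in the flat siblings (aman_cli, aman_benchmarks, ...);
# this module just keeps `python3 -m aman` and `import aman` working.
import sys

import aman_cli


def main(argv=None):
    # Delegate straight to the real CLI entry point (assumed here to be aman_cli.main).
    return aman_cli.main(sys.argv[1:] if argv is None else argv)


if __name__ == "__main__":
    raise SystemExit(main())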
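"""Tests for the benchmark-facing CLI commands (bench, eval-models, build-heuristic-dataset) exposed via aman_benchmarks."""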
import io
import json
import sys
import tempfile
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch

ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import aman_benchmarks
import aman_cli
from config import Config


class _FakeBenchEditorStage:
    def warmup(self):
        return

    def rewrite(self, transcript, *, language, dictionary_context):
        _ = dictionary_context
        return SimpleNamespace(
            final_text=f"[{language}] {transcript.strip()}",
            latency_ms=1.0,
            pass1_ms=0.5,
            pass2_ms=0.5,
        )


class AmanBenchmarksTests(unittest.TestCase):
    def test_bench_command_json_output(self):
        args = aman_cli.parse_cli_args(
            ["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"]
        )
        out = io.StringIO()
        with patch("aman_benchmarks.load", return_value=Config()), patch(
            "aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
        ), patch("sys.stdout", out):
            exit_code = aman_benchmarks.bench_command(args)

        self.assertEqual(exit_code, 0)
        payload = json.loads(out.getvalue())
        self.assertEqual(payload["measured_runs"], 2)
        self.assertEqual(payload["summary"]["runs"], 2)
        self.assertEqual(len(payload["runs"]), 2)
        self.assertEqual(payload["editor_backend"], "local_llama_builtin")
        self.assertIn("avg_alignment_ms", payload["summary"])
        self.assertIn("avg_fact_guard_ms", payload["summary"])
        self.assertIn("alignment_applied", payload["runs"][0])
        self.assertIn("fact_guard_action", payload["runs"][0])

    def test_bench_command_supports_text_file_input(self):
        with tempfile.TemporaryDirectory() as td:
            text_file = Path(td) / "input.txt"
            text_file.write_text("hello from file", encoding="utf-8")
            args = aman_cli.parse_cli_args(
                ["bench", "--text-file", str(text_file), "--repeat", "1", "--warmup", "0", "--print-output"]
            )
            out = io.StringIO()
            with patch("aman_benchmarks.load", return_value=Config()), patch(
                "aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
            ), patch("sys.stdout", out):
                exit_code = aman_benchmarks.bench_command(args)

            self.assertEqual(exit_code, 0)
            self.assertIn("[auto] hello from file", out.getvalue())

    def test_bench_command_rejects_empty_input(self):
        args = aman_cli.parse_cli_args(["bench", "--text", " "])
        with patch("aman_benchmarks.load", return_value=Config()), patch(
            "aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
        ):
            exit_code = aman_benchmarks.bench_command(args)

        self.assertEqual(exit_code, 1)

    def test_bench_command_rejects_non_positive_repeat(self):
        args = aman_cli.parse_cli_args(["bench", "--text", "hello", "--repeat", "0"])
        with patch("aman_benchmarks.load", return_value=Config()), patch(
            "aman_benchmarks.build_editor_stage", return_value=_FakeBenchEditorStage()
        ):
            exit_code = aman_benchmarks.bench_command(args)

        self.assertEqual(exit_code, 1)

    def test_eval_models_command_writes_report(self):
        with tempfile.TemporaryDirectory() as td:
            output_path = Path(td) / "report.json"
            args = aman_cli.parse_cli_args(
                [
                    "eval-models",
                    "--dataset",
                    "benchmarks/cleanup_dataset.jsonl",
                    "--matrix",
                    "benchmarks/model_matrix.small_first.json",
                    "--output",
                    str(output_path),
                    "--json",
                ]
            )
            out = io.StringIO()
            fake_report = {
                "models": [
                    {
                        "name": "base",
                        "best_param_set": {
                            "latency_ms": {"p50": 1000.0},
                            "quality": {"hybrid_score_avg": 0.8, "parse_valid_rate": 1.0},
                        },
                    }
                ],
                "winner_recommendation": {"name": "base", "reason": "test"},
            }
            with patch("aman_benchmarks.run_model_eval", return_value=fake_report), patch(
                "sys.stdout", out
            ):
                exit_code = aman_benchmarks.eval_models_command(args)
            self.assertEqual(exit_code, 0)
            self.assertTrue(output_path.exists())
            payload = json.loads(output_path.read_text(encoding="utf-8"))
            self.assertEqual(payload["winner_recommendation"]["name"], "base")

    def test_eval_models_command_forwards_heuristic_arguments(self):
        args = aman_cli.parse_cli_args(
            [
                "eval-models",
                "--dataset",
                "benchmarks/cleanup_dataset.jsonl",
                "--matrix",
                "benchmarks/model_matrix.small_first.json",
                "--heuristic-dataset",
                "benchmarks/heuristics_dataset.jsonl",
                "--heuristic-weight",
                "0.35",
                "--report-version",
                "2",
                "--json",
            ]
        )
        out = io.StringIO()
        fake_report = {
            "models": [{"name": "base", "best_param_set": {}}],
            "winner_recommendation": {"name": "base", "reason": "ok"},
        }
        with patch("aman_benchmarks.run_model_eval", return_value=fake_report) as run_eval_mock, patch(
            "sys.stdout", out
        ):
            exit_code = aman_benchmarks.eval_models_command(args)
        self.assertEqual(exit_code, 0)
        run_eval_mock.assert_called_once_with(
            "benchmarks/cleanup_dataset.jsonl",
            "benchmarks/model_matrix.small_first.json",
            heuristic_dataset_path="benchmarks/heuristics_dataset.jsonl",
            heuristic_weight=0.35,
            report_version=2,
            verbose=False,
        )

    def test_build_heuristic_dataset_command_json_output(self):
        args = aman_cli.parse_cli_args(
            [
                "build-heuristic-dataset",
                "--input",
                "benchmarks/heuristics_dataset.raw.jsonl",
                "--output",
                "benchmarks/heuristics_dataset.jsonl",
                "--json",
            ]
        )
        out = io.StringIO()
        summary = {
            "raw_rows": 4,
            "written_rows": 4,
            "generated_word_rows": 2,
            "output_path": "benchmarks/heuristics_dataset.jsonl",
        }
        with patch("aman_benchmarks.build_heuristic_dataset", return_value=summary), patch(
            "sys.stdout", out
        ):
            exit_code = aman_benchmarks.build_heuristic_dataset_command(args)
        self.assertEqual(exit_code, 0)
        payload = json.loads(out.getvalue())
        self.assertEqual(payload["written_rows"], 4)


if __name__ == "__main__":
    unittest.main()