Add benchmark-driven model promotion workflow and pipeline stages
Some checks failed
ci / test-and-build (push) Has been cancelled
Some checks failed
ci / test-and-build (push) Has been cancelled
This commit is contained in:
parent
98b13d1069
commit
8c1f7c1e13
38 changed files with 5300 additions and 503 deletions
|
|
@@ -4,6 +4,7 @@ import sys
|
|||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import patch
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
|
|
@@ -92,6 +93,20 @@ class _RetrySetupDesktop(_FakeDesktop):
|
|||
on_quit()
|
||||
|
||||
|
||||
class _FakeBenchEditorStage:
    """Stand-in editor stage that the bench command tests patch in for ``aman._build_editor_stage``."""

    def warmup(self):
        """Do nothing; the fake has nothing to warm up."""
        return None

    def rewrite(self, transcript, *, language, dictionary_context):
        """Return a canned result: the stripped transcript prefixed with ``[language]``.

        ``dictionary_context`` is accepted to match the call signature and ignored.
        Timing fields are fixed constants so callers get deterministic numbers.
        """
        del dictionary_context  # deliberately unused
        tagged_text = f"[{language}] {transcript.strip()}"
        return SimpleNamespace(
            final_text=tagged_text,
            latency_ms=1.0,
            pass1_ms=0.5,
            pass2_ms=0.5,
        )
|
||||
|
||||
|
||||
class AmanCliTests(unittest.TestCase):
|
||||
def test_parse_cli_args_defaults_to_run_command(self):
|
||||
args = aman._parse_cli_args(["--dry-run"])
|
||||
|
|
@@ -111,6 +126,85 @@ class AmanCliTests(unittest.TestCase):
|
|||
self.assertEqual(args.command, "self-check")
|
||||
self.assertTrue(args.json)
|
||||
|
||||
def test_parse_cli_args_bench_command(self):
    """Parse a full ``bench`` invocation and check each resulting namespace field."""
    argv = ["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"]
    parsed = aman._parse_cli_args(argv)

    self.assertEqual(parsed.command, "bench")
    self.assertEqual(parsed.text, "hello")
    # --repeat / --warmup are compared against ints, not the raw strings.
    self.assertEqual(parsed.repeat, 2)
    self.assertEqual(parsed.warmup, 0)
    self.assertTrue(parsed.json)
|
||||
|
||||
def test_parse_cli_args_bench_requires_input(self):
    """``bench`` given no further arguments must make the parser raise ``SystemExit``."""
    self.assertRaises(SystemExit, aman._parse_cli_args, ["bench"])
|
||||
|
||||
def test_parse_cli_args_eval_models_command(self):
    """Parse ``eval-models`` with only dataset/matrix and check the remaining fields."""
    dataset = "benchmarks/cleanup_dataset.jsonl"
    matrix = "benchmarks/model_matrix.small_first.json"
    parsed = aman._parse_cli_args(
        ["eval-models", "--dataset", dataset, "--matrix", matrix]
    )

    self.assertEqual(parsed.command, "eval-models")
    self.assertEqual(parsed.dataset, dataset)
    self.assertEqual(parsed.matrix, matrix)
    # Values expected when the heuristic/report flags are not passed.
    self.assertEqual(parsed.heuristic_dataset, "")
    self.assertEqual(parsed.heuristic_weight, 0.25)
    self.assertEqual(parsed.report_version, 2)
|
||||
|
||||
def test_parse_cli_args_eval_models_with_heuristic_options(self):
    """Parse ``eval-models`` with explicit heuristic and report-version flags."""
    heuristic_dataset = "benchmarks/heuristics_dataset.jsonl"
    argv = [
        "eval-models",
        "--dataset", "benchmarks/cleanup_dataset.jsonl",
        "--matrix", "benchmarks/model_matrix.small_first.json",
        "--heuristic-dataset", heuristic_dataset,
        "--heuristic-weight", "0.4",
        "--report-version", "2",
    ]
    parsed = aman._parse_cli_args(argv)

    self.assertEqual(parsed.heuristic_dataset, heuristic_dataset)
    self.assertEqual(parsed.heuristic_weight, 0.4)
    self.assertEqual(parsed.report_version, 2)
|
||||
|
||||
def test_parse_cli_args_build_heuristic_dataset_command(self):
    """Parse ``build-heuristic-dataset`` and check command, input and output fields."""
    raw_path = "benchmarks/heuristics_dataset.raw.jsonl"
    built_path = "benchmarks/heuristics_dataset.jsonl"
    parsed = aman._parse_cli_args(
        ["build-heuristic-dataset", "--input", raw_path, "--output", built_path]
    )

    self.assertEqual(parsed.command, "build-heuristic-dataset")
    self.assertEqual(parsed.input, raw_path)
    self.assertEqual(parsed.output, built_path)
|
||||
|
||||
def test_parse_cli_args_sync_default_model_command(self):
    """Parse ``sync-default-model`` with all three paths plus ``--check``."""
    report = "benchmarks/results/latest.json"
    artifacts = "benchmarks/model_artifacts.json"
    constants = "src/constants.py"
    argv = [
        "sync-default-model",
        "--report", report,
        "--artifacts", artifacts,
        "--constants", constants,
        "--check",
    ]
    parsed = aman._parse_cli_args(argv)

    self.assertEqual(parsed.command, "sync-default-model")
    self.assertEqual(parsed.report, report)
    self.assertEqual(parsed.artifacts, artifacts)
    self.assertEqual(parsed.constants, constants)
    self.assertTrue(parsed.check)
|
||||
|
||||
def test_version_command_prints_version(self):
|
||||
out = io.StringIO()
|
||||
args = aman._parse_cli_args(["version"])
|
||||
|
|
@@ -145,6 +239,259 @@ class AmanCliTests(unittest.TestCase):
|
|||
self.assertEqual(exit_code, 2)
|
||||
self.assertIn("[FAIL] config.load", out.getvalue())
|
||||
|
||||
def test_bench_command_json_output(self):
    """Run ``bench --json`` against the fake editor stage and inspect the printed payload."""
    args = aman._parse_cli_args(
        ["bench", "--text", "hello", "--repeat", "2", "--warmup", "0", "--json"]
    )
    captured = io.StringIO()
    load_patch = patch("aman.load", return_value=Config())
    stage_patch = patch("aman._build_editor_stage", return_value=_FakeBenchEditorStage())
    with load_patch, stage_patch, patch("sys.stdout", captured):
        exit_code = aman._bench_command(args)

    self.assertEqual(exit_code, 0)
    payload = json.loads(captured.getvalue())
    # Two measured runs were requested via --repeat 2.
    self.assertEqual(payload["measured_runs"], 2)
    self.assertEqual(payload["summary"]["runs"], 2)
    self.assertEqual(len(payload["runs"]), 2)
    self.assertEqual(payload["editor_backend"], "local_llama_builtin")
    summary = payload["summary"]
    self.assertIn("avg_alignment_ms", summary)
    self.assertIn("avg_fact_guard_ms", summary)
    first_run = payload["runs"][0]
    self.assertIn("alignment_applied", first_run)
    self.assertIn("fact_guard_action", first_run)
|
||||
|
||||
def test_bench_command_supports_text_file_input(self):
    """Run ``bench --text-file`` and check the file's content reaches the printed output."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        source_file = Path(tmp_dir) / "input.txt"
        source_file.write_text("hello from file", encoding="utf-8")
        argv = [
            "bench",
            "--text-file", str(source_file),
            "--repeat", "1",
            "--warmup", "0",
            "--print-output",
        ]
        args = aman._parse_cli_args(argv)
        captured = io.StringIO()
        load_patch = patch("aman.load", return_value=Config())
        stage_patch = patch(
            "aman._build_editor_stage", return_value=_FakeBenchEditorStage()
        )
        with load_patch, stage_patch, patch("sys.stdout", captured):
            exit_code = aman._bench_command(args)

        self.assertEqual(exit_code, 0)
        # The fake stage echoes "[<language>] <text>"; "auto" is presumably the
        # language of a default Config -- confirm against Config's definition.
        self.assertIn("[auto] hello from file", captured.getvalue())
|
||||
|
||||
def test_bench_command_rejects_empty_input(self):
    """A whitespace-only ``--text`` value makes the bench command return exit code 1."""
    args = aman._parse_cli_args(["bench", "--text", " "])
    load_patch = patch("aman.load", return_value=Config())
    stage_patch = patch("aman._build_editor_stage", return_value=_FakeBenchEditorStage())
    with load_patch, stage_patch:
        exit_code = aman._bench_command(args)

    self.assertEqual(exit_code, 1)
|
||||
|
||||
def test_bench_command_rejects_non_positive_repeat(self):
    """``--repeat 0`` makes the bench command return exit code 1."""
    args = aman._parse_cli_args(["bench", "--text", "hello", "--repeat", "0"])
    load_patch = patch("aman.load", return_value=Config())
    stage_patch = patch("aman._build_editor_stage", return_value=_FakeBenchEditorStage())
    with load_patch, stage_patch:
        exit_code = aman._bench_command(args)

    self.assertEqual(exit_code, 1)
|
||||
|
||||
def test_eval_models_command_writes_report(self):
    """Run ``eval-models --output`` with a stubbed evaluator and read back the report file."""
    stub_report = {
        "models": [
            {
                "name": "base",
                "best_param_set": {
                    "latency_ms": {"p50": 1000.0},
                    "quality": {"hybrid_score_avg": 0.8, "parse_valid_rate": 1.0},
                },
            }
        ],
        "winner_recommendation": {"name": "base", "reason": "test"},
    }
    with tempfile.TemporaryDirectory() as tmp_dir:
        report_file = Path(tmp_dir) / "report.json"
        argv = [
            "eval-models",
            "--dataset", "benchmarks/cleanup_dataset.jsonl",
            "--matrix", "benchmarks/model_matrix.small_first.json",
            "--output", str(report_file),
            "--json",
        ]
        args = aman._parse_cli_args(argv)
        captured = io.StringIO()
        eval_patch = patch("aman.run_model_eval", return_value=stub_report)
        with eval_patch, patch("sys.stdout", captured):
            exit_code = aman._eval_models_command(args)
        self.assertEqual(exit_code, 0)
        # The file must be inspected before the temporary directory is removed.
        self.assertTrue(report_file.exists())
        written = json.loads(report_file.read_text(encoding="utf-8"))
        self.assertEqual(written["winner_recommendation"]["name"], "base")
|
||||
|
||||
def test_eval_models_command_forwards_heuristic_arguments(self):
    """Check the exact arguments ``_eval_models_command`` passes to ``run_model_eval``."""
    dataset = "benchmarks/cleanup_dataset.jsonl"
    matrix = "benchmarks/model_matrix.small_first.json"
    heuristic_dataset = "benchmarks/heuristics_dataset.jsonl"
    argv = [
        "eval-models",
        "--dataset", dataset,
        "--matrix", matrix,
        "--heuristic-dataset", heuristic_dataset,
        "--heuristic-weight", "0.35",
        "--report-version", "2",
        "--json",
    ]
    args = aman._parse_cli_args(argv)
    captured = io.StringIO()
    stub_report = {
        "models": [{"name": "base", "best_param_set": {}}],
        "winner_recommendation": {"name": "base", "reason": "ok"},
    }
    eval_patch = patch("aman.run_model_eval", return_value=stub_report)
    with eval_patch as run_eval_mock, patch("sys.stdout", captured):
        exit_code = aman._eval_models_command(args)
    self.assertEqual(exit_code, 0)
    # Dataset and matrix go positionally; the rest are keyword arguments.
    run_eval_mock.assert_called_once_with(
        dataset,
        matrix,
        heuristic_dataset_path=heuristic_dataset,
        heuristic_weight=0.35,
        report_version=2,
        verbose=False,
    )
|
||||
|
||||
def test_build_heuristic_dataset_command_json_output(self):
    """Run ``build-heuristic-dataset --json`` with a stubbed builder and parse the output."""
    argv = [
        "build-heuristic-dataset",
        "--input", "benchmarks/heuristics_dataset.raw.jsonl",
        "--output", "benchmarks/heuristics_dataset.jsonl",
        "--json",
    ]
    args = aman._parse_cli_args(argv)
    captured = io.StringIO()
    stub_summary = {
        "raw_rows": 4,
        "written_rows": 4,
        "generated_word_rows": 2,
        "output_path": "benchmarks/heuristics_dataset.jsonl",
    }
    builder_patch = patch("aman.build_heuristic_dataset", return_value=stub_summary)
    with builder_patch, patch("sys.stdout", captured):
        exit_code = aman._build_heuristic_dataset_command(args)
    self.assertEqual(exit_code, 0)
    printed = json.loads(captured.getvalue())
    self.assertEqual(printed["written_rows"], 4)
|
||||
|
||||
def _write_sync_default_model_fixtures(self, directory):
    """Create the report, artifacts and constants files shared by the sync tests.

    The report names ``test-model`` as winner, the artifacts file maps that
    name to ``winner.gguf`` (URL and an all-``a`` SHA-256), and the constants
    file starts out pointing at ``old.gguf`` (all-``b`` SHA-256).

    Returns ``(report_path, artifacts_path, constants_path)`` inside *directory*.
    """
    base = Path(directory)
    report_path = base / "latest.json"
    artifacts_path = base / "artifacts.json"
    constants_path = base / "constants.py"
    report_path.write_text(
        json.dumps({"winner_recommendation": {"name": "test-model"}}),
        encoding="utf-8",
    )
    artifacts_path.write_text(
        json.dumps(
            {
                "models": [
                    {
                        "name": "test-model",
                        "filename": "winner.gguf",
                        "url": "https://example.invalid/winner.gguf",
                        "sha256": "a" * 64,
                    }
                ]
            }
        ),
        encoding="utf-8",
    )
    constants_path.write_text(
        (
            'MODEL_NAME = "old.gguf"\n'
            'MODEL_URL = "https://example.invalid/old.gguf"\n'
            'MODEL_SHA256 = "' + ("b" * 64) + '"\n'
        ),
        encoding="utf-8",
    )
    return report_path, artifacts_path, constants_path

def test_sync_default_model_command_updates_constants(self):
    """Without ``--check`` the command returns 0 and rewrites the constants file."""
    with tempfile.TemporaryDirectory() as td:
        report_path, artifacts_path, constants_path = (
            self._write_sync_default_model_fixtures(td)
        )

        args = aman._parse_cli_args(
            [
                "sync-default-model",
                "--report",
                str(report_path),
                "--artifacts",
                str(artifacts_path),
                "--constants",
                str(constants_path),
            ]
        )
        exit_code = aman._sync_default_model_command(args)
        self.assertEqual(exit_code, 0)
        updated = constants_path.read_text(encoding="utf-8")
        # All three constants must now carry the winner's artifact values.
        self.assertIn('MODEL_NAME = "winner.gguf"', updated)
        self.assertIn('MODEL_URL = "https://example.invalid/winner.gguf"', updated)
        self.assertIn('MODEL_SHA256 = "' + ("a" * 64) + '"', updated)

def test_sync_default_model_command_check_mode_returns_2_on_drift(self):
    """With ``--check`` and stale constants the command returns 2 and leaves the file alone."""
    with tempfile.TemporaryDirectory() as td:
        report_path, artifacts_path, constants_path = (
            self._write_sync_default_model_fixtures(td)
        )

        args = aman._parse_cli_args(
            [
                "sync-default-model",
                "--report",
                str(report_path),
                "--artifacts",
                str(artifacts_path),
                "--constants",
                str(constants_path),
                "--check",
            ]
        )
        exit_code = aman._sync_default_model_command(args)
        self.assertEqual(exit_code, 2)
        updated = constants_path.read_text(encoding="utf-8")
        # Check mode must not modify the constants file.
        self.assertIn('MODEL_NAME = "old.gguf"', updated)
|
||||
|
||||
def test_init_command_creates_default_config(self):
|
||||
with tempfile.TemporaryDirectory() as td:
|
||||
path = Path(td) / "config.json"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue