Add benchmark-driven model promotion workflow and pipeline stages
Some checks failed
ci / test-and-build (push) Has been cancelled
Some checks failed
ci / test-and-build (push) Has been cancelled
This commit is contained in:
parent
98b13d1069
commit
8c1f7c1e13
38 changed files with 5300 additions and 503 deletions
137
tests/test_model_eval.py
Normal file
137
tests/test_model_eval.py
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
# Make the project's src/ directory importable before pulling in model_eval.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"

_src_path = str(SRC)
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)

import model_eval
|
||||
|
||||
|
||||
class _FakeProcessor:
|
||||
def __init__(self, *args, **kwargs):
|
||||
_ = (args, kwargs)
|
||||
|
||||
def warmup(self, **kwargs):
|
||||
_ = kwargs
|
||||
return
|
||||
|
||||
def process(self, text, **kwargs):
|
||||
_ = kwargs
|
||||
return text.strip()
|
||||
|
||||
|
||||
class ModelEvalTests(unittest.TestCase):
    """Tests for model_eval: dataset loading, evaluation runs, and reporting.

    All fixtures are written as temporary JSONL/JSON files; the heavyweight
    LlamaProcessor is replaced with _FakeProcessor so no model is loaded.
    """

    def test_load_eval_dataset_validates_required_fields(self):
        """A well-formed JSONL row loads into a single case with its id preserved."""
        with tempfile.TemporaryDirectory() as td:
            dataset = Path(td) / "dataset.jsonl"
            # One row carrying all required fields: id, input_text, expected_output.
            dataset.write_text(
                '{"id":"c1","input_text":"hello","expected_output":"hello"}\n',
                encoding="utf-8",
            )
            cases = model_eval.load_eval_dataset(dataset)
            self.assertEqual(len(cases), 1)
            self.assertEqual(cases[0].case_id, "c1")

    def test_run_model_eval_produces_report(self):
        """End-to-end run over baseline + one candidate yields a v2 report.

        Builds an eval dataset, a run matrix (baseline model plus one
        candidate with a 2x1 param grid), and a heuristic dataset, then runs
        the eval with the fake processor patched in and checks report shape.
        """
        with tempfile.TemporaryDirectory() as td:
            # The model file only needs to exist on disk; the fake processor
            # never actually reads GGUF content.
            model_file = Path(td) / "fake.gguf"
            model_file.write_text("fake", encoding="utf-8")
            dataset = Path(td) / "dataset.jsonl"
            dataset.write_text(
                (
                    '{"id":"c1","input_text":"hello world","expected_output":"hello world"}\n'
                    '{"id":"c2","input_text":"hello","expected_output":"hello"}\n'
                ),
                encoding="utf-8",
            )
            matrix = Path(td) / "matrix.json"
            heuristic_dataset = Path(td) / "heuristics.jsonl"
            # Run matrix: no warmup, one measured run, baseline vs. one
            # candidate whose param grid expands to two param sets.
            matrix.write_text(
                json.dumps(
                    {
                        "warmup_runs": 0,
                        "measured_runs": 1,
                        "timeout_sec": 30,
                        "baseline_model": {
                            "name": "base",
                            "provider": "local_llama",
                            "model_path": str(model_file),
                            "profile": "default",
                            "param_grid": {"temperature": [0.0]},
                        },
                        "candidate_models": [
                            {
                                "name": "small",
                                "provider": "local_llama",
                                "model_path": str(model_file),
                                "profile": "fast",
                                "param_grid": {"temperature": [0.0, 0.1], "max_tokens": [96]},
                            }
                        ],
                    }
                ),
                encoding="utf-8",
            )
            # Two heuristic cases: h1 expects the "i mean" correction rule to
            # fire; h2 is the literal usage where that rule must NOT fire.
            heuristic_dataset.write_text(
                (
                    '{"id":"h1","transcript":"set alarm for 6 i mean 7","words":[{"text":"set","start_s":0.0,"end_s":0.1,"prob":0.9},{"text":"alarm","start_s":0.2,"end_s":0.3,"prob":0.9},{"text":"for","start_s":0.4,"end_s":0.5,"prob":0.9},{"text":"6","start_s":0.6,"end_s":0.7,"prob":0.9},{"text":"i","start_s":0.8,"end_s":0.9,"prob":0.9},{"text":"mean","start_s":1.0,"end_s":1.1,"prob":0.9},{"text":"7","start_s":1.2,"end_s":1.3,"prob":0.9}],"expected_aligned_text":"set alarm for 7","expected":{"applied_min":1,"required_rule_ids":["cue_correction"]},"tags":["i_mean_correction"]}\n'
                    '{"id":"h2","transcript":"write exactly i mean this sincerely","words":[{"text":"write","start_s":0.0,"end_s":0.1,"prob":0.9},{"text":"exactly","start_s":0.2,"end_s":0.3,"prob":0.9},{"text":"i","start_s":0.4,"end_s":0.5,"prob":0.9},{"text":"mean","start_s":0.6,"end_s":0.7,"prob":0.9},{"text":"this","start_s":0.8,"end_s":0.9,"prob":0.9},{"text":"sincerely","start_s":1.0,"end_s":1.1,"prob":0.9}],"expected_aligned_text":"write exactly i mean this sincerely","expected":{"required_rule_ids":[],"forbidden_rule_ids":["cue_correction"]},"tags":["i_mean_literal"]}\n'
                ),
                encoding="utf-8",
            )
            # Patch out the real processor so the eval runs instantly and
            # deterministically.
            with patch("model_eval.LlamaProcessor", _FakeProcessor):
                report = model_eval.run_model_eval(
                    dataset,
                    matrix,
                    heuristic_dataset_path=heuristic_dataset,
                    heuristic_weight=0.3,
                    report_version=2,
                    verbose=False,
                )

            self.assertEqual(report["report_version"], 2)
            self.assertIn("models", report)
            # Baseline + one candidate = two model entries.
            self.assertEqual(len(report["models"]), 2)
            self.assertIn("winner_recommendation", report)
            self.assertIn("heuristic_eval", report)
            self.assertEqual(report["heuristic_eval"]["cases"], 2)
            self.assertIn("combined_score", report["models"][0]["best_param_set"])
            # The human-readable summary should render without error.
            summary = model_eval.format_model_eval_summary(report)
            self.assertIn("model eval summary", summary)

    def test_load_heuristic_dataset_validates_required_fields(self):
        """A minimal heuristic row loads; omitted applied_min defaults to 0."""
        with tempfile.TemporaryDirectory() as td:
            dataset = Path(td) / "heuristics.jsonl"
            # Row with words but no "expected" object at all.
            dataset.write_text(
                '{"id":"h1","transcript":"hello world","words":[{"text":"hello","start_s":0.0,"end_s":0.1},{"text":"world","start_s":0.2,"end_s":0.3}],"expected_aligned_text":"hello world"}\n',
                encoding="utf-8",
            )
            cases = model_eval.load_heuristic_dataset(dataset)
            self.assertEqual(len(cases), 1)
            self.assertEqual(cases[0].case_id, "h1")
            self.assertEqual(cases[0].expected.applied_min, 0)

    def test_build_heuristic_dataset_generates_words_when_missing(self):
        """Rows lacking word timings get synthesized words during the build."""
        with tempfile.TemporaryDirectory() as td:
            source = Path(td) / "heuristics.raw.jsonl"
            output = Path(td) / "heuristics.jsonl"
            # Source row has a transcript but no "words" array.
            source.write_text(
                '{"id":"h1","transcript":"please send it","expected_aligned_text":"please send it","expected":{"applied_min":0}}\n',
                encoding="utf-8",
            )
            summary = model_eval.build_heuristic_dataset(source, output)
            self.assertEqual(summary["written_rows"], 1)
            self.assertEqual(summary["generated_word_rows"], 1)
            # The built output must round-trip through the loader with words.
            loaded = model_eval.load_heuristic_dataset(output)
            self.assertEqual(len(loaded), 1)
            self.assertGreaterEqual(len(loaded[0].words), 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly: python tests/test_model_eval.py
    unittest.main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue