Stop shipping code that implied Aman supported a two-pass editor, external API cleanup, or a Wayland scaffold when the runtime only exercises single-pass local cleanup on X11.

Collapse aiprocess to the active single-pass Llama contract, delete desktop_wayland and the empty wayland extra, and make model_eval reject pass1_/pass2_ tuning keys while keeping pass1_ms/pass2_ms as report compatibility fields.

Remove the unused pillow dependency, switch to SPDX-style license metadata, and clean setuptools build state before packaging so deleted modules do not leak into wheels. Update the methodology and repo guidance docs, and add focused tests for desktop adapter selection, stale param rejection, and portable wheel contents.

Validate with uv lock, python3 -m unittest discover -s tests -p 'test_*.py', python3 -m py_compile src/*.py tests/*.py, and python3 -m build --wheel --sdist --no-isolation.
164 lines · 7.4 KiB · Python
import json
|
|
import sys
|
|
import tempfile
|
|
import unittest
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
# Make the project's src/ tree importable when the tests are run from a
# repository checkout (no installed package required).
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
_src_entry = str(SRC)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

import model_eval
|
|
|
|
|
|
class _FakeProcessor:
|
|
def __init__(self, *args, **kwargs):
|
|
_ = (args, kwargs)
|
|
|
|
def warmup(self, **kwargs):
|
|
_ = kwargs
|
|
return
|
|
|
|
def process(self, text, **kwargs):
|
|
_ = kwargs
|
|
return text.strip()
|
|
|
|
|
|
class ModelEvalTests(unittest.TestCase):
    """Behavioral tests for model_eval dataset loading, matrix validation,
    report generation, and heuristic-dataset tooling."""

    def test_load_eval_dataset_validates_required_fields(self):
        """A well-formed JSONL row loads into one case with its id preserved."""
        with tempfile.TemporaryDirectory() as tmp:
            dataset_path = Path(tmp) / "dataset.jsonl"
            dataset_path.write_text(
                '{"id":"c1","input_text":"hello","expected_output":"hello"}\n',
                encoding="utf-8",
            )
            loaded = model_eval.load_eval_dataset(dataset_path)
            self.assertEqual(len(loaded), 1)
            self.assertEqual(loaded[0].case_id, "c1")

    def test_run_model_eval_produces_report(self):
        """An end-to-end run over the fake processor yields a complete report:
        per-model results, winner recommendation, heuristic section, and a
        combined score on the best param set."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)

            # Fake GGUF file so model_path validation passes without a real model.
            model_file = root / "fake.gguf"
            model_file.write_text("fake", encoding="utf-8")

            dataset_path = root / "dataset.jsonl"
            dataset_path.write_text(
                '{"id":"c1","input_text":"hello world","expected_output":"hello world"}\n'
                '{"id":"c2","input_text":"hello","expected_output":"hello"}\n',
                encoding="utf-8",
            )

            # Baseline plus one candidate so the report carries two models.
            matrix_cfg = {
                "warmup_runs": 0,
                "measured_runs": 1,
                "timeout_sec": 30,
                "baseline_model": {
                    "name": "base",
                    "provider": "local_llama",
                    "model_path": str(model_file),
                    "profile": "default",
                    "param_grid": {"temperature": [0.0]},
                },
                "candidate_models": [
                    {
                        "name": "small",
                        "provider": "local_llama",
                        "model_path": str(model_file),
                        "profile": "fast",
                        "param_grid": {"temperature": [0.0, 0.1], "max_tokens": [96]},
                    }
                ],
            }
            matrix_path = root / "matrix.json"
            matrix_path.write_text(json.dumps(matrix_cfg), encoding="utf-8")

            # Two heuristic cases: one where the "i mean" cue must trigger a
            # correction, one where it is literal and must not.
            heuristic_path = root / "heuristics.jsonl"
            heuristic_path.write_text(
                '{"id":"h1","transcript":"set alarm for 6 i mean 7","words":[{"text":"set","start_s":0.0,"end_s":0.1,"prob":0.9},{"text":"alarm","start_s":0.2,"end_s":0.3,"prob":0.9},{"text":"for","start_s":0.4,"end_s":0.5,"prob":0.9},{"text":"6","start_s":0.6,"end_s":0.7,"prob":0.9},{"text":"i","start_s":0.8,"end_s":0.9,"prob":0.9},{"text":"mean","start_s":1.0,"end_s":1.1,"prob":0.9},{"text":"7","start_s":1.2,"end_s":1.3,"prob":0.9}],"expected_aligned_text":"set alarm for 7","expected":{"applied_min":1,"required_rule_ids":["cue_correction"]},"tags":["i_mean_correction"]}\n'
                '{"id":"h2","transcript":"write exactly i mean this sincerely","words":[{"text":"write","start_s":0.0,"end_s":0.1,"prob":0.9},{"text":"exactly","start_s":0.2,"end_s":0.3,"prob":0.9},{"text":"i","start_s":0.4,"end_s":0.5,"prob":0.9},{"text":"mean","start_s":0.6,"end_s":0.7,"prob":0.9},{"text":"this","start_s":0.8,"end_s":0.9,"prob":0.9},{"text":"sincerely","start_s":1.0,"end_s":1.1,"prob":0.9}],"expected_aligned_text":"write exactly i mean this sincerely","expected":{"required_rule_ids":[],"forbidden_rule_ids":["cue_correction"]},"tags":["i_mean_literal"]}\n',
                encoding="utf-8",
            )

            # Substitute the fake processor so no model inference happens.
            with patch("model_eval.LlamaProcessor", _FakeProcessor):
                report = model_eval.run_model_eval(
                    dataset_path,
                    matrix_path,
                    heuristic_dataset_path=heuristic_path,
                    heuristic_weight=0.3,
                    report_version=2,
                    verbose=False,
                )

            self.assertEqual(report["report_version"], 2)
            self.assertIn("models", report)
            self.assertEqual(len(report["models"]), 2)
            self.assertIn("winner_recommendation", report)
            self.assertIn("heuristic_eval", report)
            self.assertEqual(report["heuristic_eval"]["cases"], 2)
            self.assertIn("combined_score", report["models"][0]["best_param_set"])
            summary = model_eval.format_model_eval_summary(report)
            self.assertIn("model eval summary", summary)

    def test_load_eval_matrix_rejects_stale_pass_prefixed_param_keys(self):
        """pass1_/pass2_ tuning keys belong to the removed two-pass editor and
        must be rejected at matrix-load time with a clear error."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            model_file = root / "fake.gguf"
            model_file.write_text("fake", encoding="utf-8")

            stale_cfg = {
                "warmup_runs": 0,
                "measured_runs": 1,
                "timeout_sec": 30,
                "baseline_model": {
                    "name": "base",
                    "provider": "local_llama",
                    "model_path": str(model_file),
                    "profile": "default",
                    # Stale two-pass key: must trigger a RuntimeError below.
                    "param_grid": {"pass1_temperature": [0.0]},
                },
                "candidate_models": [],
            }
            matrix_path = root / "matrix.json"
            matrix_path.write_text(json.dumps(stale_cfg), encoding="utf-8")

            with self.assertRaisesRegex(RuntimeError, "unsupported param_grid key 'pass1_temperature'"):
                model_eval.load_eval_matrix(matrix_path)

    def test_load_heuristic_dataset_validates_required_fields(self):
        """A minimal heuristic row loads; absent expected.applied_min defaults to 0."""
        with tempfile.TemporaryDirectory() as tmp:
            dataset_path = Path(tmp) / "heuristics.jsonl"
            dataset_path.write_text(
                '{"id":"h1","transcript":"hello world","words":[{"text":"hello","start_s":0.0,"end_s":0.1},{"text":"world","start_s":0.2,"end_s":0.3}],"expected_aligned_text":"hello world"}\n',
                encoding="utf-8",
            )
            loaded = model_eval.load_heuristic_dataset(dataset_path)
            self.assertEqual(len(loaded), 1)
            self.assertEqual(loaded[0].case_id, "h1")
            self.assertEqual(loaded[0].expected.applied_min, 0)

    def test_build_heuristic_dataset_generates_words_when_missing(self):
        """Rows lacking word timings get synthetic words generated, and the
        rebuilt dataset still round-trips through the loader."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            raw_path = root / "heuristics.raw.jsonl"
            built_path = root / "heuristics.jsonl"
            raw_path.write_text(
                '{"id":"h1","transcript":"please send it","expected_aligned_text":"please send it","expected":{"applied_min":0}}\n',
                encoding="utf-8",
            )
            summary = model_eval.build_heuristic_dataset(raw_path, built_path)
            self.assertEqual(summary["written_rows"], 1)
            self.assertEqual(summary["generated_word_rows"], 1)
            reloaded = model_eval.load_heuristic_dataset(built_path)
            self.assertEqual(len(reloaded), 1)
            self.assertGreaterEqual(len(reloaded[0].words), 1)
|
|
|
|
|
|
# Allow running this test module directly: python tests/test_model_eval.py
if __name__ == "__main__":
    unittest.main()
|