Add benchmark-driven model promotion workflow and pipeline stages
Some checks failed
ci / test-and-build (push) Has been cancelled
Some checks failed
ci / test-and-build (push) Has been cancelled
This commit is contained in:
parent
98b13d1069
commit
8c1f7c1e13
38 changed files with 5300 additions and 503 deletions
48
benchmarks/README.md
Normal file
48
benchmarks/README.md
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
# Model Evaluation Benchmarks
|
||||
|
||||
This folder defines the inputs for `aman eval-models`.
|
||||
|
||||
## Files
|
||||
|
||||
- `cleanup_dataset.jsonl`: expected-output cases for rewrite quality.
|
||||
- `heuristics_dataset.raw.jsonl`: source authoring file for heuristic-alignment evaluation.
|
||||
- `heuristics_dataset.jsonl`: canonical heuristic dataset with explicit timed words.
|
||||
- `model_matrix.small_first.json`: small-model candidate matrix and parameter sweeps.
|
||||
- `model_artifacts.json`: registry mapping model names to artifact URLs and SHA256 checksums, used for promotion.
|
||||
- `results/latest.json`: latest winner report used by `sync-default-model`.
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
aman build-heuristic-dataset \
|
||||
--input benchmarks/heuristics_dataset.raw.jsonl \
|
||||
--output benchmarks/heuristics_dataset.jsonl
|
||||
|
||||
aman eval-models \
|
||||
--dataset benchmarks/cleanup_dataset.jsonl \
|
||||
--matrix benchmarks/model_matrix.small_first.json \
|
||||
--heuristic-dataset benchmarks/heuristics_dataset.jsonl \
|
||||
--heuristic-weight 0.25 \
|
||||
--output benchmarks/results/latest.json
|
||||
|
||||
aman sync-default-model \
|
||||
--report benchmarks/results/latest.json \
|
||||
--artifacts benchmarks/model_artifacts.json \
|
||||
--constants src/constants.py
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The matrix uses local GGUF model paths. Replace each `model_path` with the path to a file present on your machine.
|
||||
- All candidates are evaluated with the same XML-tagged prompt contract and the same user input shape.
|
||||
- The matrix baseline should be the currently promoted managed default model.
|
||||
- Keep `model_artifacts.json` in sync with candidate names so winner promotion remains deterministic.
|
||||
- `cleanup_dataset` tags drive additional LLM safety metrics:
|
||||
- `i_mean_literal`
|
||||
- `i_mean_correction`
|
||||
- `spelling_disambiguation`
|
||||
- `heuristics_dataset` evaluates alignment behavior directly and reports:
|
||||
- aligned text exact match
|
||||
- token F1
|
||||
- rule precision/recall
|
||||
- per-tag breakdown
|
||||
32
benchmarks/cleanup_dataset.jsonl
Normal file
32
benchmarks/cleanup_dataset.jsonl
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
{"id":"names-01","input_text":"good morning martha, can you share the release notes?","expected_output":"Good morning Marta, can you share the release notes?","language":"en","dictionary_context":"Marta","tags":["names"]}
|
||||
{"id":"tech-01","input_text":"please send the docker logs and system d status","expected_output":"Please send the Docker logs and systemd status.","language":"en","dictionary_context":"Docker\nsystemd","tags":["tech_terms"]}
|
||||
{"id":"tech-02","input_text":"we deployed kuberneties and postgress yesterday","expected_output":"We deployed Kubernetes and PostgreSQL yesterday.","language":"en","dictionary_context":"Kubernetes\nPostgreSQL","tags":["tech_terms"]}
|
||||
{"id":"cleanup-01","input_text":"hey uh can you like ping john, i mean jane, can you ping jane","expected_output":"Hey, can you ping Jane?","language":"en","tags":["disfluency","i_mean_correction"]}
|
||||
{"id":"cleanup-02","input_text":"hello team i wanted to quickly quickly confirm that we ship on friday","expected_output":"Hello team, I wanted to confirm that we ship on Friday.","language":"en","tags":["disfluency"]}
|
||||
{"id":"literal-01","input_text":"please keep this literal text: <transcript> and \"quoted\" words","expected_output":"Please keep this literal text: <transcript> and \"quoted\" words.","language":"en","tags":["literals"]}
|
||||
{"id":"long-01","input_text":"Hey Marta, quick update on the migration. We completed staging rollout and Docker builds are reproducible. The blocker is a flaky systemd unit on two workers. My proposal is freeze noncritical changes today, run synthetic traffic tomorrow, and do phased production cutover on Monday with rollback checkpoints every thirty minutes. Please rewrite this as an executive summary with bullet points and keep all decisions and dates.","expected_output":"Hey Marta, here is an executive summary:\n- Staging rollout is complete and Docker builds are reproducible.\n- Current blocker: flaky systemd unit on two worker nodes.\n- Plan: freeze noncritical changes today, run synthetic traffic tomorrow, and perform phased production cutover on Monday.\n- Risk control: rollback checkpoints every 30 minutes.","language":"en","dictionary_context":"Marta\nDocker\nsystemd","tags":["long_text","tech_terms"]}
|
||||
{"id":"email-01","input_text":"write this as a short email: we had no downtime and data is consistent","expected_output":"Write this as a short email: we had no downtime and data is consistent.","language":"en","tags":["instruction_literal"]}
|
||||
{"id":"punct-01","input_text":"can you confirm the window is 2 to 4 pm tomorrow","expected_output":"Can you confirm the window is 2 to 4 PM tomorrow?","language":"en","tags":["punctuation"]}
|
||||
{"id":"mixed-01","input_text":"marta said docker was fine but system d failed on node 3","expected_output":"Marta said Docker was fine, but systemd failed on node 3.","language":"en","dictionary_context":"Marta\nDocker\nsystemd","tags":["names","tech_terms"]}
|
||||
{"id":"i-mean-correction-01","input_text":"set the alarm for 6, i mean 7","expected_output":"Set the alarm for 7.","language":"en","tags":["i_mean_correction"]}
|
||||
{"id":"i-mean-correction-02","input_text":"book for monday, i mean tuesday","expected_output":"Book for Tuesday.","language":"en","tags":["i_mean_correction"]}
|
||||
{"id":"i-mean-correction-03","input_text":"call martha, i mean marta","expected_output":"Call Marta.","language":"en","dictionary_context":"Marta","tags":["i_mean_correction","names"]}
|
||||
{"id":"i-mean-correction-04","input_text":"use port 8080 i mean 8081","expected_output":"Use port 8081.","language":"en","tags":["i_mean_correction"]}
|
||||
{"id":"i-mean-correction-05","input_text":"ship in june i mean july","expected_output":"Ship in July.","language":"en","tags":["i_mean_correction"]}
|
||||
{"id":"i-mean-literal-01","input_text":"write this exactly: i mean this sincerely","expected_output":"Write this exactly: I mean this sincerely.","language":"en","tags":["i_mean_literal"]}
|
||||
{"id":"i-mean-literal-02","input_text":"the quote is i mean business","expected_output":"The quote is: I mean business.","language":"en","tags":["i_mean_literal"]}
|
||||
{"id":"i-mean-literal-03","input_text":"please keep this phrase verbatim i mean 7","expected_output":"Please keep this phrase verbatim: I mean 7.","language":"en","tags":["i_mean_literal"]}
|
||||
{"id":"i-mean-literal-04","input_text":"he said quote i mean it unquote","expected_output":"He said \"I mean it.\"","language":"en","tags":["i_mean_literal"]}
|
||||
{"id":"i-mean-literal-05","input_text":"title this section i mean progress","expected_output":"Title this section: I mean progress.","language":"en","tags":["i_mean_literal"]}
|
||||
{"id":"spelling-01","input_text":"lets call julia thats j u l i a","expected_output":"Let's call Julia.","language":"en","tags":["spelling_disambiguation"]}
|
||||
{"id":"spelling-02","input_text":"her name is marta m a r t a","expected_output":"Her name is Marta.","language":"en","tags":["spelling_disambiguation","names"]}
|
||||
{"id":"spelling-03","input_text":"use postgresql spelled p o s t g r e s q l","expected_output":"Use PostgreSQL.","language":"en","tags":["spelling_disambiguation","tech_terms"]}
|
||||
{"id":"spelling-04","input_text":"service is system d as in s y s t e m d","expected_output":"Service is systemd.","language":"en","tags":["spelling_disambiguation","tech_terms"]}
|
||||
{"id":"spelling-05","input_text":"deploy docker thats d o c k e r","expected_output":"Deploy Docker.","language":"en","tags":["spelling_disambiguation","tech_terms"]}
|
||||
{"id":"instruction-literal-01","input_text":"type this sentence rewrite this as an email","expected_output":"Type this sentence: rewrite this as an email.","language":"en","tags":["instruction_literal"]}
|
||||
{"id":"instruction-literal-02","input_text":"write this text make this funnier","expected_output":"Write this text: make this funnier.","language":"en","tags":["instruction_literal"]}
|
||||
{"id":"instruction-literal-03","input_text":"keep literal no transformation: summarize this","expected_output":"Keep literal, no transformation: summarize this.","language":"en","tags":["instruction_literal"]}
|
||||
{"id":"instruction-literal-04","input_text":"dictate exactly improve the tone","expected_output":"Dictate exactly: improve the tone.","language":"en","tags":["instruction_literal"]}
|
||||
{"id":"instruction-literal-05","input_text":"this line says rewrite as bullet list","expected_output":"This line says: rewrite as bullet list.","language":"en","tags":["instruction_literal"]}
|
||||
{"id":"long-ambiguous-01","input_text":"i mean this timeline is serious. deploy on friday, i mean saturday if tests fail","expected_output":"I mean this timeline is serious. Deploy on Saturday if tests fail.","language":"en","tags":["long_text","i_mean_correction"]}
|
||||
{"id":"long-ambiguous-02","input_text":"the phrase i mean 7 should stay. schedule review for 6 i mean 7","expected_output":"The phrase \"I mean 7\" should stay. Schedule review for 7.","language":"en","tags":["long_text","i_mean_literal","i_mean_correction"]}
|
||||
8
benchmarks/heuristics_dataset.jsonl
Normal file
8
benchmarks/heuristics_dataset.jsonl
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
{"id": "corr-time-01", "transcript": "set alarm for 6 i mean 7", "words": [{"text": "set", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "alarm", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "for", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "6", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "i", "start_s": 0.8, "end_s": 0.9, "prob": 0.9}, {"text": "mean", "start_s": 1.0, "end_s": 1.1, "prob": 0.9}, {"text": "7", "start_s": 1.2, "end_s": 1.3, "prob": 0.9}], "expected_aligned_text": "set alarm for 7", "expected": {"applied_min": 1, "required_rule_ids": ["cue_correction"], "forbidden_rule_ids": []}, "tags": ["i_mean_correction", "timing_sensitive"]}
|
||||
{"id": "corr-time-gap-01", "transcript": "set alarm for 6 i mean 7", "words": [{"text": "set", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "alarm", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "for", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "6", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "i", "start_s": 2.0, "end_s": 2.1, "prob": 0.9}, {"text": "mean", "start_s": 2.2, "end_s": 2.3, "prob": 0.9}, {"text": "7", "start_s": 2.4, "end_s": 2.5, "prob": 0.9}], "expected_aligned_text": "set alarm for 6 i mean 7", "expected": {"applied_min": 0, "required_rule_ids": [], "forbidden_rule_ids": ["cue_correction"]}, "tags": ["i_mean_literal", "timing_sensitive"]}
|
||||
{"id": "literal-mean-01", "transcript": "write exactly i mean this sincerely", "words": [{"text": "write", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "exactly", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "i", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "mean", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "this", "start_s": 0.8, "end_s": 0.9, "prob": 0.9}, {"text": "sincerely", "start_s": 1.0, "end_s": 1.1, "prob": 0.9}], "expected_aligned_text": "write exactly i mean this sincerely", "expected": {"applied_min": 0, "required_rule_ids": [], "forbidden_rule_ids": ["cue_correction"]}, "tags": ["i_mean_literal"]}
|
||||
{"id": "restart-01", "transcript": "please send it please send it", "words": [{"text": "please", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "send", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "it", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "please", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "send", "start_s": 0.8, "end_s": 0.9, "prob": 0.9}, {"text": "it", "start_s": 1.0, "end_s": 1.1, "prob": 0.9}], "expected_aligned_text": "please send it", "expected": {"applied_min": 1, "required_rule_ids": ["restart_repeat"], "forbidden_rule_ids": []}, "tags": ["restart"]}
|
||||
{"id": "actually-correction-01", "transcript": "set alarm for 6 actually 7", "words": [{"text": "set", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "alarm", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "for", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "6", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "actually", "start_s": 0.8, "end_s": 0.9, "prob": 0.9}, {"text": "7", "start_s": 1.0, "end_s": 1.1, "prob": 0.9}], "expected_aligned_text": "set alarm for 7", "expected": {"applied_min": 1, "required_rule_ids": ["cue_correction"], "forbidden_rule_ids": []}, "tags": ["actually_correction"]}
|
||||
{"id": "sorry-correction-01", "transcript": "set alarm for 6 sorry 7", "words": [{"text": "set", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "alarm", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "for", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "6", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "sorry", "start_s": 0.8, "end_s": 0.9, "prob": 0.9}, {"text": "7", "start_s": 1.0, "end_s": 1.1, "prob": 0.9}], "expected_aligned_text": "set alarm for 7", "expected": {"applied_min": 1, "required_rule_ids": ["cue_correction"], "forbidden_rule_ids": []}, "tags": ["sorry_correction"]}
|
||||
{"id": "no-correction-phrase-01", "transcript": "set alarm for 6 i mean", "words": [{"text": "set", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "alarm", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}, {"text": "for", "start_s": 0.4, "end_s": 0.5, "prob": 0.9}, {"text": "6", "start_s": 0.6, "end_s": 0.7, "prob": 0.9}, {"text": "i", "start_s": 0.8, "end_s": 0.9, "prob": 0.9}, {"text": "mean", "start_s": 1.0, "end_s": 1.1, "prob": 0.9}], "expected_aligned_text": "set alarm for 6 i mean", "expected": {"applied_min": 0, "required_rule_ids": [], "forbidden_rule_ids": ["cue_correction"]}, "tags": ["i_mean_literal"]}
|
||||
{"id": "baseline-unchanged-01", "transcript": "hello world", "words": [{"text": "hello", "start_s": 0.0, "end_s": 0.1, "prob": 0.9}, {"text": "world", "start_s": 0.2, "end_s": 0.3, "prob": 0.9}], "expected_aligned_text": "hello world", "expected": {"applied_min": 0, "required_rule_ids": [], "forbidden_rule_ids": []}, "tags": ["baseline"]}
|
||||
8
benchmarks/heuristics_dataset.raw.jsonl
Normal file
8
benchmarks/heuristics_dataset.raw.jsonl
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
{"id":"corr-time-01","transcript":"set alarm for 6 i mean 7","words":[{"text":"set","start_s":0.0,"end_s":0.1,"prob":0.9},{"text":"alarm","start_s":0.2,"end_s":0.3,"prob":0.9},{"text":"for","start_s":0.4,"end_s":0.5,"prob":0.9},{"text":"6","start_s":0.6,"end_s":0.7,"prob":0.9},{"text":"i","start_s":0.8,"end_s":0.9,"prob":0.9},{"text":"mean","start_s":1.0,"end_s":1.1,"prob":0.9},{"text":"7","start_s":1.2,"end_s":1.3,"prob":0.9}],"expected_aligned_text":"set alarm for 7","expected":{"applied_min":1,"required_rule_ids":["cue_correction"]},"tags":["i_mean_correction","timing_sensitive"]}
|
||||
{"id":"corr-time-gap-01","transcript":"set alarm for 6 i mean 7","words":[{"text":"set","start_s":0.0,"end_s":0.1,"prob":0.9},{"text":"alarm","start_s":0.2,"end_s":0.3,"prob":0.9},{"text":"for","start_s":0.4,"end_s":0.5,"prob":0.9},{"text":"6","start_s":0.6,"end_s":0.7,"prob":0.9},{"text":"i","start_s":2.0,"end_s":2.1,"prob":0.9},{"text":"mean","start_s":2.2,"end_s":2.3,"prob":0.9},{"text":"7","start_s":2.4,"end_s":2.5,"prob":0.9}],"expected_aligned_text":"set alarm for 6 i mean 7","expected":{"applied_min":0,"forbidden_rule_ids":["cue_correction"]},"tags":["i_mean_literal","timing_sensitive"]}
|
||||
{"id":"literal-mean-01","transcript":"write exactly i mean this sincerely","expected_aligned_text":"write exactly i mean this sincerely","expected":{"applied_min":0,"forbidden_rule_ids":["cue_correction"]},"tags":["i_mean_literal"]}
|
||||
{"id":"restart-01","transcript":"please send it please send it","expected_aligned_text":"please send it","expected":{"applied_min":1,"required_rule_ids":["restart_repeat"]},"tags":["restart"]}
|
||||
{"id":"actually-correction-01","transcript":"set alarm for 6 actually 7","expected_aligned_text":"set alarm for 7","expected":{"applied_min":1,"required_rule_ids":["cue_correction"]},"tags":["actually_correction"]}
|
||||
{"id":"sorry-correction-01","transcript":"set alarm for 6 sorry 7","expected_aligned_text":"set alarm for 7","expected":{"applied_min":1,"required_rule_ids":["cue_correction"]},"tags":["sorry_correction"]}
|
||||
{"id":"no-correction-phrase-01","transcript":"set alarm for 6 i mean","expected_aligned_text":"set alarm for 6 i mean","expected":{"applied_min":0,"forbidden_rule_ids":["cue_correction"]},"tags":["i_mean_literal"]}
|
||||
{"id":"baseline-unchanged-01","transcript":"hello world","expected_aligned_text":"hello world","expected":{"applied_min":0},"tags":["baseline"]}
|
||||
34
benchmarks/model_artifacts.json
Normal file
34
benchmarks/model_artifacts.json
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
{
|
||||
"models": [
|
||||
{
|
||||
"name": "qwen2.5-1.5b-instruct-q4_k_m",
|
||||
"filename": "Qwen2.5-1.5B-Instruct-Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf",
|
||||
"sha256": "1adf0b11065d8ad2e8123ea110d1ec956dab4ab038eab665614adba04b6c3370"
|
||||
},
|
||||
{
|
||||
"name": "qwen2.5-0.5b-instruct-q4_k_m",
|
||||
"filename": "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
|
||||
"sha256": "6eb923e7d26e9cea28811e1a8e852009b21242fb157b26149d3b188f3a8c8653"
|
||||
},
|
||||
{
|
||||
"name": "smollm2-360m-instruct-q4_k_m",
|
||||
"filename": "SmolLM2-360M-Instruct-Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/bartowski/SmolLM2-360M-Instruct-GGUF/resolve/main/SmolLM2-360M-Instruct-Q4_K_M.gguf",
|
||||
"sha256": "2fa3f013dcdd7b99f9b237717fa0b12d75bbb89984cc1274be1471a465bac9c2"
|
||||
},
|
||||
{
|
||||
"name": "llama-3.2-1b-instruct-q4_k_m",
|
||||
"filename": "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
|
||||
"sha256": "6f85a640a97cf2bf5b8e764087b1e83da0fdb51d7c9fab7d0fece9385611df83"
|
||||
},
|
||||
{
|
||||
"name": "llama-3.2-3b-q4_k_m",
|
||||
"filename": "Llama-3.2-3B-Instruct-Q4_K_M.gguf",
|
||||
"url": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf",
|
||||
"sha256": "6c1a2b41161032677be168d354123594c0e6e67d2b9227c84f296ad037c728ff"
|
||||
}
|
||||
]
|
||||
}
|
||||
77
benchmarks/model_matrix.small_first.json
Normal file
77
benchmarks/model_matrix.small_first.json
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
{
|
||||
"warmup_runs": 1,
|
||||
"measured_runs": 2,
|
||||
"timeout_sec": 120,
|
||||
"baseline_model": {
|
||||
"name": "qwen2.5-1.5b-instruct-q4_k_m",
|
||||
"provider": "local_llama",
|
||||
"model_path": "/path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf",
|
||||
"profile": "default",
|
||||
"param_grid": {
|
||||
"temperature": [0.0],
|
||||
"max_tokens": [192],
|
||||
"top_p": [0.95],
|
||||
"top_k": [40],
|
||||
"repeat_penalty": [1.0],
|
||||
"min_p": [0.0]
|
||||
}
|
||||
},
|
||||
"candidate_models": [
|
||||
{
|
||||
"name": "qwen2.5-0.5b-instruct-q4_k_m",
|
||||
"provider": "local_llama",
|
||||
"model_path": "/path/to/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf",
|
||||
"profile": "fast",
|
||||
"param_grid": {
|
||||
"temperature": [0.0, 0.1],
|
||||
"max_tokens": [96, 128],
|
||||
"top_p": [0.9, 0.95],
|
||||
"top_k": [20, 40],
|
||||
"repeat_penalty": [1.0, 1.1],
|
||||
"min_p": [0.0, 0.05]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "smollm2-360m-instruct-q4_k_m",
|
||||
"provider": "local_llama",
|
||||
"model_path": "/path/to/SmolLM2-360M-Instruct-Q4_K_M.gguf",
|
||||
"profile": "fast",
|
||||
"param_grid": {
|
||||
"temperature": [0.0, 0.1, 0.2],
|
||||
"max_tokens": [96, 128],
|
||||
"top_p": [0.9, 0.95],
|
||||
"top_k": [20, 40],
|
||||
"repeat_penalty": [1.0, 1.1],
|
||||
"min_p": [0.0, 0.05]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "llama-3.2-1b-instruct-q4_k_m",
|
||||
"provider": "local_llama",
|
||||
"model_path": "/path/to/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
|
||||
"profile": "fast",
|
||||
"param_grid": {
|
||||
"temperature": [0.0, 0.1],
|
||||
"max_tokens": [128, 192],
|
||||
"top_p": [0.9, 0.95],
|
||||
"top_k": [20, 40],
|
||||
"repeat_penalty": [1.0, 1.1],
|
||||
"min_p": [0.0, 0.05]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "llama-3.2-3b-q4_k_m",
|
||||
"provider": "local_llama",
|
||||
"model_path": "/path/to/Llama-3.2-3B-Instruct-Q4_K_M.gguf",
|
||||
"profile": "default",
|
||||
"param_grid": {
|
||||
"temperature": [0.0, 0.1],
|
||||
"max_tokens": [192, 256],
|
||||
"top_p": [0.9, 0.95],
|
||||
"top_k": [20, 40],
|
||||
"repeat_penalty": [1.0, 1.1],
|
||||
"min_p": [0.0, 0.05]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
12
benchmarks/results/latest.json
Normal file
12
benchmarks/results/latest.json
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"report_version": 2,
|
||||
"winner_recommendation": {
|
||||
"name": "qwen2.5-1.5b-instruct-q4_k_m",
|
||||
"reason": "fastest eligible model with combined-score quality floor"
|
||||
},
|
||||
"models": [],
|
||||
"notes": {
|
||||
"source": "latest model speed/quality sweep",
|
||||
"updated_by": "manual promotion"
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue