Add benchmark-driven model promotion workflow and pipeline stages
Some checks failed
ci / test-and-build (push) Has been cancelled

This commit is contained in:
Thales Maciel 2026-02-28 15:12:33 -03:00
parent 98b13d1069
commit 8c1f7c1e13
38 changed files with 5300 additions and 503 deletions

View file

@ -6,7 +6,15 @@ BUILD_DIR := $(CURDIR)/build
RUN_ARGS := $(wordlist 2,$(words $(MAKECMDGOALS)),$(MAKECMDGOALS))
RUN_CONFIG := $(if $(RUN_ARGS),$(abspath $(firstword $(RUN_ARGS))),$(CONFIG))
.PHONY: run doctor self-check sync test check build package package-deb package-arch release-check install-local install-service install clean-dist clean-build clean
.PHONY: run doctor self-check eval-models build-heuristic-dataset sync-default-model check-default-model sync test check build package package-deb package-arch release-check install-local install-service install clean-dist clean-build clean
EVAL_DATASET ?= $(CURDIR)/benchmarks/cleanup_dataset.jsonl
EVAL_MATRIX ?= $(CURDIR)/benchmarks/model_matrix.small_first.json
EVAL_OUTPUT ?= $(CURDIR)/benchmarks/results/latest.json
EVAL_HEURISTIC_RAW ?= $(CURDIR)/benchmarks/heuristics_dataset.raw.jsonl
EVAL_HEURISTIC_DATASET ?= $(CURDIR)/benchmarks/heuristics_dataset.jsonl
EVAL_HEURISTIC_WEIGHT ?= 0.25
MODEL_ARTIFACTS ?= $(CURDIR)/benchmarks/model_artifacts.json
CONSTANTS_FILE ?= $(CURDIR)/src/constants.py
ifneq ($(filter run,$(firstword $(MAKECMDGOALS))),)
.PHONY: $(RUN_ARGS)
@ -23,6 +31,18 @@ doctor:
self-check:
uv run aman self-check --config $(CONFIG)
build-heuristic-dataset:
uv run aman build-heuristic-dataset --input $(EVAL_HEURISTIC_RAW) --output $(EVAL_HEURISTIC_DATASET)
eval-models: build-heuristic-dataset
uv run aman eval-models --dataset $(EVAL_DATASET) --matrix $(EVAL_MATRIX) --heuristic-dataset $(EVAL_HEURISTIC_DATASET) --heuristic-weight $(EVAL_HEURISTIC_WEIGHT) --output $(EVAL_OUTPUT)
sync-default-model:
uv run aman sync-default-model --report $(EVAL_OUTPUT) --artifacts $(MODEL_ARTIFACTS) --constants $(CONSTANTS_FILE)
check-default-model:
uv run aman sync-default-model --check --report $(EVAL_OUTPUT) --artifacts $(MODEL_ARTIFACTS) --constants $(CONSTANTS_FILE)
sync:
uv sync
@ -45,6 +65,7 @@ package-arch:
./scripts/package_arch.sh
release-check:
$(MAKE) check-default-model
$(PYTHON) -m py_compile src/*.py tests/*.py
$(MAKE) test
$(MAKE) build