Add benchmark-driven model promotion workflow and pipeline stages
Some checks failed
ci / test-and-build (push) Has been cancelled

This commit is contained in:
Thales Maciel 2026-02-28 15:12:33 -03:00
parent 98b13d1069
commit 8c1f7c1e13
38 changed files with 5300 additions and 503 deletions

View file

@ -15,8 +15,11 @@ if str(SRC) not in sys.path:
import aiprocess
from aiprocess import (
ExternalApiProcessor,
LlamaProcessor,
_assert_expected_model_checksum,
_build_request_payload,
_build_user_prompt_xml,
_explicit_generation_kwargs,
_extract_cleaned_text,
_profile_generation_kwargs,
_supports_response_format,
@ -114,6 +117,75 @@ class SupportsResponseFormatTests(unittest.TestCase):
self.assertEqual(kwargs, {})
def test_explicit_generation_kwargs_honors_supported_params(self):
    """Check which explicit sampling options are kept for a given callable.

    The stub accepts ``top_p`` and ``max_tokens`` but not ``top_k``,
    ``repeat_penalty`` or ``min_p``; the result is expected to contain
    only the first two.
    """

    def chat_completion(*, messages, temperature, top_p, max_tokens):
        return None

    requested = {
        "top_p": 0.9,
        "top_k": 40,
        "max_tokens": 128,
        "repeat_penalty": 1.1,
        "min_p": 0.05,
    }
    expected = {"top_p": 0.9, "max_tokens": 128}

    kwargs = _explicit_generation_kwargs(chat_completion, **requested)

    self.assertEqual(kwargs, expected)
class _WarmupClient:
def __init__(self, response_payload: dict):
self.response_payload = response_payload
self.calls = []
def create_chat_completion(
self,
*,
messages,
temperature,
response_format=None,
max_tokens=None,
):
self.calls.append(
{
"messages": messages,
"temperature": temperature,
"response_format": response_format,
"max_tokens": max_tokens,
}
)
return self.response_payload
class LlamaWarmupTests(unittest.TestCase):
    """Exercise ``LlamaProcessor.warmup`` against a recording stub client."""

    def test_warmup_uses_json_mode_and_low_token_budget(self):
        """A single warmup call is made with the expected request arguments."""
        stub = _WarmupClient(
            {"choices": [{"message": {"content": '{"cleaned_text":"ok"}'}}]}
        )
        # Bypass __init__ so only the attribute set below is initialised.
        processor = object.__new__(LlamaProcessor)
        processor.client = stub

        processor.warmup(profile="fast")

        self.assertEqual(len(stub.calls), 1)
        recorded = stub.calls[0]
        self.assertEqual(recorded["temperature"], 0.0)
        self.assertEqual(recorded["response_format"], {"type": "json_object"})
        self.assertEqual(recorded["max_tokens"], 32)

        # The second message is the one inspected for the XML request body.
        prompt = recorded["messages"][1]["content"]
        expected_fragments = (
            "<request>",
            "<transcript>warmup</transcript>",
            "<language>auto</language>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, prompt)

    def test_warmup_raises_on_non_json_response(self):
        """A non-JSON reply makes warmup raise RuntimeError ("expected JSON")."""
        stub = _WarmupClient({"choices": [{"message": {"content": "not-json"}}]})
        processor = object.__new__(LlamaProcessor)
        processor.client = stub

        with self.assertRaisesRegex(RuntimeError, "expected JSON"):
            processor.warmup(profile="default")
class ModelChecksumTests(unittest.TestCase):
def test_accepts_expected_checksum_case_insensitive(self):
@ -137,6 +209,19 @@ class RequestPayloadTests(unittest.TestCase):
self.assertEqual(payload["transcript"], "hello")
self.assertNotIn("dictionary", payload)
def test_user_prompt_is_xml_and_escapes_literals(self):
    """The rendered user prompt contains the expected tags and escaped text."""
    request_payload = _build_request_payload(
        'keep <transcript> and "quotes"',
        lang="en",
        dictionary_context="Docker & systemd",
    )

    rendered = _build_user_prompt_xml(request_payload)

    # Checked in this order so the first missing fragment is the one reported.
    expected_fragments = (
        "<request>",
        "<language>en</language>",
        "&lt;transcript&gt;",
        "&amp;",
        "<output_contract>",
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, rendered)
class _Response:
def __init__(self, payload: bytes):
@ -254,6 +339,21 @@ class ExternalApiProcessorTests(unittest.TestCase):
request = urlopen.call_args[0][0]
self.assertTrue(request.full_url.endswith("/chat/completions"))
def test_warmup_is_a_noop(self):
# Verifies that ExternalApiProcessor.warmup() does not call urlopen.
# The environment is replaced (clear=True) so that AMAN_EXTERNAL_API_KEY
# is the only variable set while the patch is active.
with patch.dict(os.environ, {"AMAN_EXTERNAL_API_KEY": "test-key"}, clear=True):
processor = ExternalApiProcessor(
provider="openai",
base_url="https://api.openai.com/v1",
model="gpt-4o-mini",
api_key_env_var="AMAN_EXTERNAL_API_KEY",
timeout_ms=1000,
max_retries=0,
)
# urlopen is replaced by a mock so that any call made during warmup
# would be recorded on it.
with patch("aiprocess.urllib.request.urlopen") as urlopen:
processor.warmup(profile="fast")
# The mock must have recorded no calls.
urlopen.assert_not_called()
# Run the unittest runner when this file is executed directly as a script.
if __name__ == "__main__":
unittest.main()