Add benchmark-driven model promotion workflow and pipeline stages

2026-02-28 15:12:33 -03:00 · 2026-02-28 15:12:33 -03:00 · 8c1f7c1e13
commit 8c1f7c1e13
parent 98b13d1069
38 changed files with 5300 additions and 503 deletions
--- a/tests/test_aiprocess.py
+++ b/tests/test_aiprocess.py
@ -15,8 +15,11 @@ if str(SRC) not in sys.path:
 import aiprocess
 from aiprocess import (
    ExternalApiProcessor,
+    LlamaProcessor,
    _assert_expected_model_checksum,
    _build_request_payload,
+    _build_user_prompt_xml,
+    _explicit_generation_kwargs,
    _extract_cleaned_text,
    _profile_generation_kwargs,
    _supports_response_format,
@ -114,6 +117,75 @@ class SupportsResponseFormatTests(unittest.TestCase):

        self.assertEqual(kwargs, {})

+    def test_explicit_generation_kwargs_honors_supported_params(self):  # expects only kwargs the callable accepts to be returned
+        def chat_completion(*, messages, temperature, top_p, max_tokens):  # stub: declares no top_k, repeat_penalty or min_p
+            return None
+
+        kwargs = _explicit_generation_kwargs(
+            chat_completion,
+            top_p=0.9,
+            top_k=40,  # not a parameter of the stub
+            max_tokens=128,
+            repeat_penalty=1.1,  # not a parameter of the stub
+            min_p=0.05,  # not a parameter of the stub
+        )
+        self.assertEqual(kwargs, {"top_p": 0.9, "max_tokens": 128})  # only the two names the stub declares remain
+
+
+class _WarmupClient:  # test double: records each create_chat_completion call and returns a canned payload
+    def __init__(self, response_payload: dict):
+        self.response_payload = response_payload  # returned unchanged by every create_chat_completion call
+        self.calls = []  # one dict of received arguments per call, in call order
+
+    def create_chat_completion(  # keyword-only arguments; messages and temperature are required
+        self,
+        *,
+        messages,
+        temperature,
+        response_format=None,
+        max_tokens=None,
+    ):
+        self.calls.append(  # capture the arguments so tests can assert on them afterwards
+            {
+                "messages": messages,
+                "temperature": temperature,
+                "response_format": response_format,
+                "max_tokens": max_tokens,
+            }
+        )
+        return self.response_payload
+
+
+class LlamaWarmupTests(unittest.TestCase):  # exercises LlamaProcessor.warmup against the recording _WarmupClient
+    def test_warmup_uses_json_mode_and_low_token_budget(self):
+        processor = object.__new__(LlamaProcessor)  # allocate the instance without running __init__
+        client = _WarmupClient(
+            {"choices": [{"message": {"content": '{"cleaned_text":"ok"}'}}]}  # canned reply whose content is a JSON object
+        )
+        processor.client = client  # warmup is expected to send its request through this attribute
+
+        processor.warmup(profile="fast")
+
+        self.assertEqual(len(client.calls), 1)  # exactly one completion request was issued
+        call = client.calls[0]
+        self.assertEqual(call["temperature"], 0.0)
+        self.assertEqual(call["response_format"], {"type": "json_object"})  # JSON mode was requested
+        self.assertEqual(call["max_tokens"], 32)  # the low token budget named in the test title
+        user_content = call["messages"][1]["content"]  # second message; presumably the user turn - confirm against warmup
+        self.assertIn("<request>", user_content)
+        self.assertIn("<transcript>warmup</transcript>", user_content)
+        self.assertIn("<language>auto</language>", user_content)
+
+    def test_warmup_raises_on_non_json_response(self):
+        processor = object.__new__(LlamaProcessor)  # allocate the instance without running __init__
+        client = _WarmupClient(
+            {"choices": [{"message": {"content": "not-json"}}]}  # canned reply whose content is not JSON
+        )
+        processor.client = client
+
+        with self.assertRaisesRegex(RuntimeError, "expected JSON"):  # the error message must mention "expected JSON"
+            processor.warmup(profile="default")
+

 class ModelChecksumTests(unittest.TestCase):
    def test_accepts_expected_checksum_case_insensitive(self):
@ -137,6 +209,19 @@ class RequestPayloadTests(unittest.TestCase):
        self.assertEqual(payload["transcript"], "hello")
        self.assertNotIn("dictionary", payload)

+    def test_user_prompt_is_xml_and_escapes_literals(self):  # the XML prompt must escape markup characters found in the payload
+        payload = _build_request_payload(
+            'keep <transcript> and "quotes"',  # transcript text containing a literal tag and double quotes
+            lang="en",
+            dictionary_context="Docker & systemd",  # contains a bare ampersand
+        )
+        xml = _build_user_prompt_xml(payload)
+        self.assertIn("<request>", xml)
+        self.assertIn("<language>en</language>", xml)
+        self.assertIn("&lt;transcript&gt;", xml)  # the literal <transcript> must appear in escaped form
+        self.assertIn("&amp;", xml)  # an escaped ampersand must appear
+        self.assertIn("<output_contract>", xml)
+

 class _Response:
    def __init__(self, payload: bytes):
@ -254,6 +339,21 @@ class ExternalApiProcessorTests(unittest.TestCase):
        request = urlopen.call_args[0][0]
        self.assertTrue(request.full_url.endswith("/chat/completions"))

+    def test_warmup_is_a_noop(self):  # ExternalApiProcessor.warmup must not call urlopen
+        with patch.dict(os.environ, {"AMAN_EXTERNAL_API_KEY": "test-key"}, clear=True):  # env holds only this key inside the block
+            processor = ExternalApiProcessor(
+                provider="openai",
+                base_url="https://api.openai.com/v1",
+                model="gpt-4o-mini",
+                api_key_env_var="AMAN_EXTERNAL_API_KEY",  # same variable name that patch.dict sets above
+                timeout_ms=1000,
+                max_retries=0,
+            )
+            with patch("aiprocess.urllib.request.urlopen") as urlopen:  # replace urlopen with a mock during warmup
+                processor.warmup(profile="fast")
+
+        urlopen.assert_not_called()  # the mock stays bound after the with-block and records zero calls
+

 if __name__ == "__main__":
    unittest.main()