Commit 662db6e

Add preflight LLM check before dispatching evaluations (#2109)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 8987bdf commit 662db6e

3 files changed: +309 -5 lines changed

.github/run-eval/resolve_model_config.py

Lines changed: 107 additions & 3 deletions
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
 """
-Resolve model IDs to full model configurations.
+Resolve model IDs to full model configurations and verify model availability.

 Reads:
 - MODEL_IDS: comma-separated model IDs
+- LLM_API_KEY: API key for litellm_proxy (optional, for preflight check)
+- LLM_BASE_URL: Base URL for litellm_proxy (optional, defaults to eval proxy)
+- SKIP_PREFLIGHT: Set to 'true' to skip the preflight LLM check

 Outputs to GITHUB_OUTPUT:
 - models_json: JSON array of full model configs with display names
@@ -12,6 +15,9 @@
 import json
 import os
 import sys
+from typing import Any
+
+import litellm


 # Model configurations dictionary
@@ -212,6 +218,101 @@ def find_models_by_id(model_ids: list[str]) -> list[dict]:
     return resolved


+def test_model(
+    model_config: dict[str, Any],
+    api_key: str,
+    base_url: str,
+    timeout: int = 60,
+) -> tuple[bool, str]:
+    """Test a single model with a simple completion request using litellm.
+
+    Args:
+        model_config: Model configuration dict with 'llm_config' key
+        api_key: API key for authentication
+        base_url: Base URL for the LLM proxy
+        timeout: Request timeout in seconds
+
+    Returns:
+        Tuple of (success: bool, message: str)
+    """
+    llm_config = model_config.get("llm_config", {})
+    model_name = llm_config.get("model", "unknown")
+    display_name = model_config.get("display_name", model_name)
+
+    try:
+        # Build kwargs from llm_config, excluding 'model' which is passed separately
+        kwargs = {k: v for k, v in llm_config.items() if k != "model"}
+
+        response = litellm.completion(
+            model=model_name,
+            messages=[{"role": "user", "content": "Say 'OK' if you can read this."}],
+            max_tokens=10,
+            api_key=api_key,
+            base_url=base_url,
+            timeout=timeout,
+            **kwargs,
+        )
+
+        content = response.choices[0].message.content if response.choices else None
+        if content:
+            return True, f"✓ {display_name}: OK"
+        else:
+            return False, f"✗ {display_name}: Empty response"
+
+    except litellm.exceptions.Timeout:
+        return False, f"✗ {display_name}: Request timed out after {timeout}s"
+    except litellm.exceptions.APIConnectionError as e:
+        return False, f"✗ {display_name}: Connection error - {e}"
+    except litellm.exceptions.BadRequestError as e:
+        return False, f"✗ {display_name}: Bad request - {e}"
+    except litellm.exceptions.NotFoundError as e:
+        return False, f"✗ {display_name}: Model not found - {e}"
+    except Exception as e:
+        return False, f"✗ {display_name}: {type(e).__name__} - {e}"
+
+
+def run_preflight_check(models: list[dict[str, Any]]) -> bool:
+    """Run preflight LLM check for all models.
+
+    Args:
+        models: List of model configurations to test
+
+    Returns:
+        True if all models passed, False otherwise
+    """
+    api_key = os.environ.get("LLM_API_KEY")
+    base_url = os.environ.get("LLM_BASE_URL", "https://llm-proxy.eval.all-hands.dev")
+    skip_preflight = os.environ.get("SKIP_PREFLIGHT", "").lower() == "true"
+
+    if skip_preflight:
+        print("Preflight check: SKIPPED (SKIP_PREFLIGHT=true)")
+        return True
+
+    if not api_key:
+        print("Preflight check: SKIPPED (LLM_API_KEY not set)")
+        return True
+
+    print(f"\nPreflight LLM check for {len(models)} model(s)...")
+    print("-" * 50)
+
+    all_passed = True
+    for model_config in models:
+        success, message = test_model(model_config, api_key, base_url)
+        print(message)
+        if not success:
+            all_passed = False
+
+    print("-" * 50)
+
+    if all_passed:
+        print(f"✓ All {len(models)} model(s) passed preflight check\n")
+    else:
+        print("✗ Some models failed preflight check")
+        print("Evaluation aborted to avoid wasting compute resources.\n")
+
+    return all_passed
+
+
 def main() -> None:
     model_ids_str = get_required_env("MODEL_IDS")
     github_output = get_required_env("GITHUB_OUTPUT")
@@ -221,14 +322,17 @@ def main() -> None:

     # Resolve model configs
     resolved = find_models_by_id(model_ids)
+    print(f"Resolved {len(resolved)} model(s): {', '.join(model_ids)}")
+
+    # Run preflight check
+    if not run_preflight_check(resolved):
+        error_exit("Preflight LLM check failed")

     # Output as JSON
     models_json = json.dumps(resolved, separators=(",", ":"))
     with open(github_output, "a", encoding="utf-8") as f:
         f.write(f"models_json={models_json}\n")

-    print(f"Resolved {len(resolved)} model(s): {', '.join(model_ids)}")
-

 if __name__ == "__main__":
     main()
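
For reference, the resolver and its new preflight check can be exercised outside CI. A minimal sketch, assuming a litellm-compatible proxy key is available; the model ID, key, and output path below are illustrative, not part of this commit:

    # Hypothetical local run: MODEL_IDS and GITHUB_OUTPUT are required by the script;
    # setting LLM_API_KEY enables the preflight check, and LLM_BASE_URL defaults to the eval proxy.
    export MODEL_IDS="glm-5"                      # illustrative model ID
    export LLM_API_KEY="sk-example"               # leave unset to skip the preflight check
    export GITHUB_OUTPUT="/tmp/models_out.txt"
    python3 .github/run-eval/resolve_model_config.py
    cat "$GITHUB_OUTPUT"                          # expect a models_json=<JSON array> line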

.github/workflows/run-eval.yml

Lines changed: 7 additions & 1 deletion
@@ -231,10 +231,16 @@ jobs:
           printf 'pr_number=%s\n' "$PR_NUMBER" >> "$GITHUB_OUTPUT"
           printf 'trigger_desc=%s\n' "$TRIGGER_DESCRIPTION" >> "$GITHUB_OUTPUT"

-      - name: Resolve model configurations
+      - name: Install dependencies
+        run: |
+          pip install 'litellm>=1.81.0'
+
+      - name: Resolve model configurations and verify availability
         id: resolve-models
         env:
           MODEL_IDS: ${{ steps.params.outputs.models }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev
         run: |
           python3 .github/run-eval/resolve_model_config.py
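
Note that SKIP_PREFLIGHT is read by the script but not set in this workflow step; a minimal sketch of bypassing the check manually (for example when the proxy is temporarily unreachable), assuming the same environment variables as the step above:

    # Hypothetical: resolve model configs without calling the LLM proxy
    SKIP_PREFLIGHT=true python3 .github/run-eval/resolve_model_config.py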

tests/github_workflows/test_resolve_model_config.py

Lines changed: 195 additions & 1 deletion
@@ -3,7 +3,7 @@
 import sys
 from pathlib import Path
 from typing import Any
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch

 import pytest
 from pydantic import BaseModel, field_validator, model_validator
@@ -15,6 +15,8 @@
 from resolve_model_config import (  # noqa: E402 # type: ignore[import-not-found]
     MODELS,
     find_models_by_id,
+    run_preflight_check,
+    test_model,
 )


@@ -254,3 +256,195 @@ def test_glm_5_config():
     assert model["display_name"] == "GLM-5"
     assert model["llm_config"]["model"] == "litellm_proxy/openrouter/z-ai/glm-5"
     assert model["llm_config"]["disable_vision"] is True
+
+
+# Tests for preflight check functionality
+
+
+class TestTestModel:
+    """Tests for the test_model function."""
+
+    def test_successful_response(self):
+        """Test that a successful model response returns True."""
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch(
+            "resolve_model_config.litellm.completion", return_value=mock_response
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is True
+        assert "✓" in message
+        assert "Test Model" in message
+
+    def test_empty_response(self):
+        """Test that an empty response returns False."""
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content=""))]
+
+        with patch(
+            "resolve_model_config.litellm.completion", return_value=mock_response
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "Empty response" in message
+
+    def test_timeout_error(self):
+        """Test that timeout errors are handled correctly."""
+        import litellm
+
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+
+        with patch(
+            "resolve_model_config.litellm.completion",
+            side_effect=litellm.exceptions.Timeout(
+                message="Timeout", model="test-model", llm_provider="test"
+            ),
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "timed out" in message
+
+    def test_connection_error(self):
+        """Test that connection errors are handled correctly."""
+        import litellm
+
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+
+        with patch(
+            "resolve_model_config.litellm.completion",
+            side_effect=litellm.exceptions.APIConnectionError(
+                message="Connection failed", llm_provider="test", model="test-model"
+            ),
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "Connection error" in message
+
+    def test_model_not_found_error(self):
+        """Test that model not found errors are handled correctly."""
+        import litellm
+
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+
+        with patch(
+            "resolve_model_config.litellm.completion",
+            side_effect=litellm.exceptions.NotFoundError(
+                "Model not found", llm_provider="test", model="test-model"
+            ),
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "not found" in message
+
+    def test_passes_llm_config_params(self):
+        """Test that llm_config parameters are passed to litellm."""
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {
+                "model": "litellm_proxy/test-model",
+                "temperature": 0.5,
+                "top_p": 0.9,
+            },
+        }
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch(
+            "resolve_model_config.litellm.completion", return_value=mock_response
+        ) as mock_completion:
+            test_model(model_config, "test-key", "https://test.com")
+
+        mock_completion.assert_called_once()
+        call_kwargs = mock_completion.call_args[1]
+        assert call_kwargs["temperature"] == 0.5
+        assert call_kwargs["top_p"] == 0.9
+
+
+class TestRunPreflightCheck:
+    """Tests for the run_preflight_check function."""
+
+    def test_skip_when_no_api_key(self):
+        """Test that preflight check is skipped when LLM_API_KEY is not set."""
+        models = [{"display_name": "Test", "llm_config": {"model": "test"}}]
+
+        with patch.dict("os.environ", {}, clear=True):
+            result = run_preflight_check(models)
+
+        assert result is True  # Skipped = success
+
+    def test_skip_when_skip_preflight_true(self):
+        """Test that preflight check is skipped when SKIP_PREFLIGHT=true."""
+        models = [{"display_name": "Test", "llm_config": {"model": "test"}}]
+
+        with patch.dict(
+            "os.environ", {"LLM_API_KEY": "test", "SKIP_PREFLIGHT": "true"}
+        ):
+            result = run_preflight_check(models)
+
+        assert result is True  # Skipped = success
+
+    def test_all_models_pass(self):
+        """Test that preflight check returns True when all models pass."""
+        models = [
+            {"display_name": "Model A", "llm_config": {"model": "model-a"}},
+            {"display_name": "Model B", "llm_config": {"model": "model-b"}},
+        ]
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch.dict("os.environ", {"LLM_API_KEY": "test"}):
+            with patch(
+                "resolve_model_config.litellm.completion", return_value=mock_response
+            ):
+                result = run_preflight_check(models)
+
+        assert result is True
+
+    def test_any_model_fails(self):
+        """Test that preflight check returns False when any model fails."""
+        models = [
+            {"display_name": "Model A", "llm_config": {"model": "model-a"}},
+            {"display_name": "Model B", "llm_config": {"model": "model-b"}},
+        ]
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        def mock_completion(**kwargs):
+            if kwargs["model"] == "model-b":
+                raise Exception("Model B failed")
+            return mock_response
+
+        with patch.dict("os.environ", {"LLM_API_KEY": "test"}):
+            with patch(
+                "resolve_model_config.litellm.completion", side_effect=mock_completion
+            ):
+                result = run_preflight_check(models)
+
+        assert result is False
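
These tests mock resolve_model_config.litellm.completion, so they run without network access; a minimal sketch of invoking just this module locally, assuming pytest, litellm, and pydantic are installed:

    # Run only the resolver tests, including TestTestModel and TestRunPreflightCheck
    python3 -m pytest tests/github_workflows/test_resolve_model_config.py -v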
