Commit 662db6e

Add preflight LLM check before dispatching evaluations (#2109)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 8987bdf commit 662db6e

3 files changed: +309 -5 lines changed

.github/run-eval/resolve_model_config.py

Lines changed: 107 additions & 3 deletions
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
 """
-Resolve model IDs to full model configurations.
+Resolve model IDs to full model configurations and verify model availability.

 Reads:
 - MODEL_IDS: comma-separated model IDs
+- LLM_API_KEY: API key for litellm_proxy (optional, for preflight check)
+- LLM_BASE_URL: Base URL for litellm_proxy (optional, defaults to eval proxy)
+- SKIP_PREFLIGHT: Set to 'true' to skip the preflight LLM check

 Outputs to GITHUB_OUTPUT:
 - models_json: JSON array of full model configs with display names
@@ -12,6 +15,9 @@
 import json
 import os
 import sys
+from typing import Any
+
+import litellm


 # Model configurations dictionary
@@ -212,6 +218,101 @@ def find_models_by_id(model_ids: list[str]) -> list[dict]:
     return resolved


+def test_model(
+    model_config: dict[str, Any],
+    api_key: str,
+    base_url: str,
+    timeout: int = 60,
+) -> tuple[bool, str]:
+    """Test a single model with a simple completion request using litellm.
+
+    Args:
+        model_config: Model configuration dict with 'llm_config' key
+        api_key: API key for authentication
+        base_url: Base URL for the LLM proxy
+        timeout: Request timeout in seconds
+
+    Returns:
+        Tuple of (success: bool, message: str)
+    """
+    llm_config = model_config.get("llm_config", {})
+    model_name = llm_config.get("model", "unknown")
+    display_name = model_config.get("display_name", model_name)
+
+    try:
+        # Build kwargs from llm_config, excluding 'model' which is passed separately
+        kwargs = {k: v for k, v in llm_config.items() if k != "model"}
+
+        response = litellm.completion(
+            model=model_name,
+            messages=[{"role": "user", "content": "Say 'OK' if you can read this."}],
+            max_tokens=10,
+            api_key=api_key,
+            base_url=base_url,
+            timeout=timeout,
+            **kwargs,
+        )
+
+        content = response.choices[0].message.content if response.choices else None
+        if content:
+            return True, f"✓ {display_name}: OK"
+        else:
+            return False, f"✗ {display_name}: Empty response"
+
+    except litellm.exceptions.Timeout:
+        return False, f"✗ {display_name}: Request timed out after {timeout}s"
+    except litellm.exceptions.APIConnectionError as e:
+        return False, f"✗ {display_name}: Connection error - {e}"
+    except litellm.exceptions.BadRequestError as e:
+        return False, f"✗ {display_name}: Bad request - {e}"
+    except litellm.exceptions.NotFoundError as e:
+        return False, f"✗ {display_name}: Model not found - {e}"
+    except Exception as e:
+        return False, f"✗ {display_name}: {type(e).__name__} - {e}"
+
+
+def run_preflight_check(models: list[dict[str, Any]]) -> bool:
+    """Run preflight LLM check for all models.
+
+    Args:
+        models: List of model configurations to test
+
+    Returns:
+        True if all models passed, False otherwise
+    """
+    api_key = os.environ.get("LLM_API_KEY")
+    base_url = os.environ.get("LLM_BASE_URL", "https://llm-proxy.eval.all-hands.dev")
+    skip_preflight = os.environ.get("SKIP_PREFLIGHT", "").lower() == "true"
+
+    if skip_preflight:
+        print("Preflight check: SKIPPED (SKIP_PREFLIGHT=true)")
+        return True
+
+    if not api_key:
+        print("Preflight check: SKIPPED (LLM_API_KEY not set)")
+        return True
+
+    print(f"\nPreflight LLM check for {len(models)} model(s)...")
+    print("-" * 50)
+
+    all_passed = True
+    for model_config in models:
+        success, message = test_model(model_config, api_key, base_url)
+        print(message)
+        if not success:
+            all_passed = False
+
+    print("-" * 50)
+
+    if all_passed:
+        print(f"✓ All {len(models)} model(s) passed preflight check\n")
+    else:
+        print("✗ Some models failed preflight check")
+        print("Evaluation aborted to avoid wasting compute resources.\n")
+
+    return all_passed
+
+
 def main() -> None:
     model_ids_str = get_required_env("MODEL_IDS")
     github_output = get_required_env("GITHUB_OUTPUT")
@@ -221,14 +322,17 @@ def main() -> None:

     # Resolve model configs
     resolved = find_models_by_id(model_ids)
+    print(f"Resolved {len(resolved)} model(s): {', '.join(model_ids)}")
+
+    # Run preflight check
+    if not run_preflight_check(resolved):
+        error_exit("Preflight LLM check failed")

     # Output as JSON
     models_json = json.dumps(resolved, separators=(",", ":"))
     with open(github_output, "a", encoding="utf-8") as f:
         f.write(f"models_json={models_json}\n")

-    print(f"Resolved {len(resolved)} model(s): {', '.join(model_ids)}")
-

 if __name__ == "__main__":
     main()
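
For reference, the resolver and its new preflight check can be exercised outside CI. A minimal sketch, assuming a litellm-compatible proxy key is available; the model ID, key, and output path below are illustrative, not part of this commit:

    # Hypothetical local run: MODEL_IDS and GITHUB_OUTPUT are required by the script;
    # setting LLM_API_KEY enables the preflight check, and LLM_BASE_URL defaults to the eval proxy.
    export MODEL_IDS="glm-5"                      # illustrative model ID
    export LLM_API_KEY="sk-example"               # leave unset to skip the preflight check
    export GITHUB_OUTPUT="/tmp/models_out.txt"
    python3 .github/run-eval/resolve_model_config.py
    cat "$GITHUB_OUTPUT"                          # expect a models_json=<JSON array> line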

.github/workflows/run-eval.yml

Lines changed: 7 additions & 1 deletion
@@ -231,10 +231,16 @@ jobs:
           printf 'pr_number=%s\n' "$PR_NUMBER" >> "$GITHUB_OUTPUT"
           printf 'trigger_desc=%s\n' "$TRIGGER_DESCRIPTION" >> "$GITHUB_OUTPUT"

-      - name: Resolve model configurations
+      - name: Install dependencies
+        run: |
+          pip install 'litellm>=1.81.0'
+
+      - name: Resolve model configurations and verify availability
         id: resolve-models
         env:
           MODEL_IDS: ${{ steps.params.outputs.models }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: https://llm-proxy.eval.all-hands.dev
         run: |
           python3 .github/run-eval/resolve_model_config.py
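
Note that SKIP_PREFLIGHT is read by the script but not set in this workflow step; a minimal sketch of bypassing the check manually (for example when the proxy is temporarily unreachable), assuming the same environment variables as the step above:

    # Hypothetical: resolve model configs without calling the LLM proxy
    SKIP_PREFLIGHT=true python3 .github/run-eval/resolve_model_config.py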

tests/github_workflows/test_resolve_model_config.py

Lines changed: 195 additions & 1 deletion
@@ -3,7 +3,7 @@
 import sys
 from pathlib import Path
 from typing import Any
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch

 import pytest
 from pydantic import BaseModel, field_validator, model_validator
@@ -15,6 +15,8 @@
 from resolve_model_config import (  # noqa: E402 # type: ignore[import-not-found]
     MODELS,
     find_models_by_id,
+    run_preflight_check,
+    test_model,
 )


@@ -254,3 +256,195 @@ def test_glm_5_config():
     assert model["display_name"] == "GLM-5"
     assert model["llm_config"]["model"] == "litellm_proxy/openrouter/z-ai/glm-5"
     assert model["llm_config"]["disable_vision"] is True
+
+
+# Tests for preflight check functionality
+
+
+class TestTestModel:
+    """Tests for the test_model function."""
+
+    def test_successful_response(self):
+        """Test that a successful model response returns True."""
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch(
+            "resolve_model_config.litellm.completion", return_value=mock_response
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is True
+        assert "✓" in message
+        assert "Test Model" in message
+
+    def test_empty_response(self):
+        """Test that an empty response returns False."""
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content=""))]
+
+        with patch(
+            "resolve_model_config.litellm.completion", return_value=mock_response
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "Empty response" in message
+
+    def test_timeout_error(self):
+        """Test that timeout errors are handled correctly."""
+        import litellm
+
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+
+        with patch(
+            "resolve_model_config.litellm.completion",
+            side_effect=litellm.exceptions.Timeout(
+                message="Timeout", model="test-model", llm_provider="test"
+            ),
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "timed out" in message
+
+    def test_connection_error(self):
+        """Test that connection errors are handled correctly."""
+        import litellm
+
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+
+        with patch(
+            "resolve_model_config.litellm.completion",
+            side_effect=litellm.exceptions.APIConnectionError(
+                message="Connection failed", llm_provider="test", model="test-model"
+            ),
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "Connection error" in message
+
+    def test_model_not_found_error(self):
+        """Test that model not found errors are handled correctly."""
+        import litellm
+
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {"model": "litellm_proxy/test-model"},
+        }
+
+        with patch(
+            "resolve_model_config.litellm.completion",
+            side_effect=litellm.exceptions.NotFoundError(
+                "Model not found", llm_provider="test", model="test-model"
+            ),
+        ):
+            success, message = test_model(model_config, "test-key", "https://test.com")
+
+        assert success is False
+        assert "✗" in message
+        assert "not found" in message
+
+    def test_passes_llm_config_params(self):
+        """Test that llm_config parameters are passed to litellm."""
+        model_config = {
+            "display_name": "Test Model",
+            "llm_config": {
+                "model": "litellm_proxy/test-model",
+                "temperature": 0.5,
+                "top_p": 0.9,
+            },
+        }
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch(
+            "resolve_model_config.litellm.completion", return_value=mock_response
+        ) as mock_completion:
+            test_model(model_config, "test-key", "https://test.com")
+
+        mock_completion.assert_called_once()
+        call_kwargs = mock_completion.call_args[1]
+        assert call_kwargs["temperature"] == 0.5
+        assert call_kwargs["top_p"] == 0.9
+
+
+class TestRunPreflightCheck:
+    """Tests for the run_preflight_check function."""
+
+    def test_skip_when_no_api_key(self):
+        """Test that preflight check is skipped when LLM_API_KEY is not set."""
+        models = [{"display_name": "Test", "llm_config": {"model": "test"}}]
+
+        with patch.dict("os.environ", {}, clear=True):
+            result = run_preflight_check(models)
+
+        assert result is True  # Skipped = success
+
+    def test_skip_when_skip_preflight_true(self):
+        """Test that preflight check is skipped when SKIP_PREFLIGHT=true."""
+        models = [{"display_name": "Test", "llm_config": {"model": "test"}}]
+
+        with patch.dict(
+            "os.environ", {"LLM_API_KEY": "test", "SKIP_PREFLIGHT": "true"}
+        ):
+            result = run_preflight_check(models)
+
+        assert result is True  # Skipped = success
+
+    def test_all_models_pass(self):
+        """Test that preflight check returns True when all models pass."""
+        models = [
+            {"display_name": "Model A", "llm_config": {"model": "model-a"}},
+            {"display_name": "Model B", "llm_config": {"model": "model-b"}},
+        ]
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch.dict("os.environ", {"LLM_API_KEY": "test"}):
+            with patch(
+                "resolve_model_config.litellm.completion", return_value=mock_response
+            ):
+                result = run_preflight_check(models)
+
+        assert result is True
+
+    def test_any_model_fails(self):
+        """Test that preflight check returns False when any model fails."""
+        models = [
+            {"display_name": "Model A", "llm_config": {"model": "model-a"}},
+            {"display_name": "Model B", "llm_config": {"model": "model-b"}},
+        ]
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        def mock_completion(**kwargs):
+            if kwargs["model"] == "model-b":
+                raise Exception("Model B failed")
+            return mock_response
+
+        with patch.dict("os.environ", {"LLM_API_KEY": "test"}):
+            with patch(
+                "resolve_model_config.litellm.completion", side_effect=mock_completion
+            ):
+                result = run_preflight_check(models)
+
+        assert result is False
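
These tests mock resolve_model_config.litellm.completion, so they run without network access; a minimal sketch of invoking just this module locally, assuming pytest, litellm, and pydantic are installed:

    # Run only the resolver tests, including TestTestModel and TestRunPreflightCheck
    python3 -m pytest tests/github_workflows/test_resolve_model_config.py -v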
