Fix sync evals API version and add retry policy (Azure#44780)

slister1001 · web-flow · commit 0eb9b34bf1d2 · 2026-01-21T10:53:33.000-08:00
* [evaluation] Update sync evals API version and add retry policy

- Update api_version from 2025-10-15-preview to 2025-11-15-preview
- Add get_sync_http_client_with_retry() helper with explicit retry config
- Configure retries on 5XX errors (500, 502, 503, 504) and 429 rate limits
- Azure Core RetryPolicy automatically honors Retry-After headers on 429

* [evaluation] Move _use_legacy_endpoint to kwargs in RedTeam

Make _use_legacy_endpoint an internal parameter by extracting it from
kwargs instead of having it as an explicit parameter in the signature.

* [evaluation] Update test recordings for sync evals API version change

Re-recorded e2e tests for content safety and multimodal evaluators
to use the updated 2025-11-15-preview API version.

* [evaluation] Update test_mass_evaluate.py assertion for new evaluation_per_turn fields

Update expected column count from 88 to 98 to account for new
evaluation_per_turn fields added for the sexual evaluator.

* [evaluation] Update test mock to use get_sync_http_client_with_retry

Patch get_sync_http_client_with_retry directly in the unit test instead of
get_http_client, for better test clarity and accuracy as suggested in code review.

* [evaluation] Update test recordings for sync evals API version change

Re-recorded the e2e tests affected by the API version change from
2025-10-15-preview to 2025-11-15-preview:
- test_code_vulnerability_evaluator
- test_ungrounded_attributes_evaluator
- test_protected_material_evaluator
- test_eci_evaluator
- test_xpia_evaluator
- test_groundedness_pro_evaluator
- TestUserAgent::test_rai_service_evaluator (10 evaluator variants)
diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_ea43da2b11"
+  "Tag": "python/evaluation/azure-ai-evaluation_409699f40b"
 }
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py
@@ -29,7 +29,7 @@
 from azure.ai.evaluation._common.utils import is_onedp_project
 from azure.core.credentials import TokenCredential
 from azure.core.exceptions import HttpResponseError
-from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy
+from azure.core.pipeline.policies import AsyncRetryPolicy, RetryPolicy, UserAgentPolicy
 
 from .constants import (
     CommonConstants,
@@ -118,6 +118,22 @@ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
     )
 
 
+def get_sync_http_client_with_retry():
+    """Get an HTTP client configured with retry policy for sync evals.
+
+    Configures retries on 5XX server errors and honors Retry-After headers on 429 responses.
+    """
+    # Configure retry policy with explicit settings for transient errors
+    # retry_on_status_codes includes 429 (rate limit) and 5XX server errors
+    retry_policy = RetryPolicy(
+        retry_total=3,
+        retry_on_status_codes=[429, 500, 502, 503, 504],
+        retry_backoff_factor=0.8,
+        retry_backoff_max=120,
+    )
+    return get_http_client(retry_policy=retry_policy)
+
+
 async def ensure_service_availability_onedp(
     client: AIProjectClient, token: str, capability: Optional[str] = None
 ) -> None:
@@ -1042,7 +1058,7 @@ async def evaluate_with_rai_service_sync(
         )
 
     # Sync evals endpoint implementation (default)
-    api_version = "2025-10-15-preview"
+    api_version = "2025-11-15-preview"
     if not is_onedp_project(project_scope):
         # Get RAI service URL from discovery service and check service availability
         token = await fetch_or_reuse_token(credential)
@@ -1055,7 +1071,7 @@ async def evaluate_with_rai_service_sync(
         sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
         sync_eval_payload_json = json.dumps(sync_eval_payload, cls=SdkJSONEncoder)
 
-        with get_http_client() as client:
+        with get_sync_http_client_with_retry() as client:
             http_response = client.post(url, data=sync_eval_payload_json, headers=headers)
 
         if http_response.status_code != 200:
@@ -1255,7 +1271,7 @@ async def evaluate_with_rai_service_sync_multimodal(
         )
 
     # Sync evals endpoint implementation (default)
-    api_version = "2025-10-15-preview"
+    api_version = "2025-11-15-preview"
     sync_eval_payload = _build_sync_eval_multimodal_payload(messages, metric_name)
 
     if is_onedp_project(project_scope):
@@ -1285,7 +1301,7 @@ async def evaluate_with_rai_service_sync_multimodal(
 
     sync_eval_payload_json = json.dumps(sync_eval_payload, cls=SdkJSONEncoder)
 
-    with get_http_client() as client:
+    with get_sync_http_client_with_retry() as client:
         http_response = client.post(url, data=sync_eval_payload_json, headers=headers)
 
     if http_response.status_code != 200:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py
@@ -126,7 +126,7 @@ def __init__(
         language: SupportedLanguages = SupportedLanguages.English,
         output_dir=".",
         attack_success_thresholds: Optional[Dict[RiskCategory, int]] = None,
-        _use_legacy_endpoint: bool = False,
+        **kwargs,
     ):
         """Initialize a new Red Team agent for AI model evaluation.
 
@@ -156,16 +156,14 @@ def __init__(
             or None to use default binary evaluation (evaluation results determine success).
             When using thresholds, scores >= threshold are considered successful attacks.
         :type attack_success_thresholds: Optional[Dict[RiskCategory, int]]
-        :param _use_legacy_endpoint: Whether to use the legacy evaluation endpoint. Defaults to False.
-        :type _use_legacy_endpoint: bool
         """
 
         self.azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self.credential = credential
         self.output_dir = output_dir
         self.language = language
         self._one_dp_project = is_onedp_project(azure_ai_project)
-        self._use_legacy_endpoint = _use_legacy_endpoint
+        self._use_legacy_endpoint = kwargs.get("_use_legacy_endpoint", False)
 
         # Configure attack success thresholds
         self.attack_success_thresholds = self._configure_attack_success_thresholds(attack_success_thresholds)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py
@@ -388,7 +388,8 @@ def test_evaluate_multimodal(
             # imageurls_with_target has 1 extra column: outputs.conversation due to the target mapping
             assert len(row_result_df.keys()) >= 33
         else:
-            assert len(row_result_df.keys()) == 88
+            # 98 columns: includes evaluation_per_turn fields for all evaluators
+            assert len(row_result_df.keys()) == 98
         known_keys = [
             "outputs.content_safety.hate_unfairness",
             "outputs.content_safety.hate_unfairness_score",
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py
@@ -399,7 +399,7 @@ async def test_get_rai_svc_url(self, client_mock, discovery_mock):
     @patch("azure.ai.evaluation._common.rai_service.fetch_or_reuse_token")
     @patch("azure.ai.evaluation._common.rai_service.get_rai_svc_url")
     @patch("azure.ai.evaluation._common.rai_service.ensure_service_availability")
-    @patch("azure.ai.evaluation._common.rai_service.get_http_client")
+    @patch("azure.ai.evaluation._common.rai_service.get_sync_http_client_with_retry")
     async def test_evaluate_with_rai_service_sync(
         self, http_client_mock, ensure_avail_mock, get_url_mock, fetch_token_mock, cred_mock
     ):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -2,5 +2,5 @@` |
| 2 | 2 | `"AssetsRepo": "Azure/azure-sdk-assets",` |
| 3 | 3 | `"AssetsRepoPrefixPath": "python",` |
| 4 | 4 | `"TagPrefix": "python/evaluation/azure-ai-evaluation",` |
| 5 |  | `- "Tag": "python/evaluation/azure-ai-evaluation_ea43da2b11"` |
|  | 5 | `+ "Tag": "python/evaluation/azure-ai-evaluation_409699f40b"` |
| 6 | 6 | `}` |