Skip to content

Commit 0eb9b34

Browse files
authored
Fix/sync evals api version retry (Azure#44780)
* [evaluation] Update sync evals API version and add retry policy
  - Update api_version from 2025-10-15-preview to 2025-11-15-preview
  - Add get_sync_http_client_with_retry() helper with explicit retry config
  - Configure retries on 5XX errors (500, 502, 503, 504) and 429 rate limits
  - Azure Core RetryPolicy automatically honors Retry-After headers on 429
* [evaluation] Move _use_legacy_endpoint to kwargs in RedTeam
  Make _use_legacy_endpoint an internal parameter by extracting it from kwargs instead of having it as an explicit parameter in the signature.
* [evaluation] Update test recordings for sync evals API version change
  Re-recorded e2e tests for content safety and multimodal evaluators to use the updated 2025-11-15-preview API version.
* [evaluation] Update test_mass_evaluate.py assertion for new evaluation_per_turn fields
  Update expected column count from 88 to 98 to account for new evaluation_per_turn fields added for the sexual evaluator.
* [evaluation] Update test mock to use get_sync_http_client_with_retry
  Update the test mock to directly target the new function name for better test clarity and accuracy as suggested in code review.
* [evaluation] Update test recordings for sync_evals API version change
  Re-recorded e2e tests affected by API version change from 2025-10-15-preview to 2025-11-15-preview:
  - test_code_vulnerability_evaluator
  - test_ungrounded_attributes_evaluator
  - test_protected_material_evaluator
  - test_eci_evaluator
  - test_xpia_evaluator
  - test_groundedness_pro_evaluator
  - TestUserAgent::test_rai_service_evaluator (10 evaluator variants)
1 parent e21a550 commit 0eb9b34

File tree

5 files changed

+27
-12
lines changed

5 files changed

+27
-12
lines changed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/evaluation/azure-ai-evaluation",
5-
"Tag": "python/evaluation/azure-ai-evaluation_ea43da2b11"
5+
"Tag": "python/evaluation/azure-ai-evaluation_409699f40b"
66
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from azure.ai.evaluation._common.utils import is_onedp_project
3030
from azure.core.credentials import TokenCredential
3131
from azure.core.exceptions import HttpResponseError
32-
from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy
32+
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryPolicy, UserAgentPolicy
3333

3434
from .constants import (
3535
CommonConstants,
@@ -118,6 +118,22 @@ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
118118
)
119119

120120

121+
def get_sync_http_client_with_retry():
122+
"""Get an HTTP client configured with retry policy for sync evals.
123+
124+
Configures retries on 5XX server errors and honors Retry-After headers on 429 responses.
125+
"""
126+
# Configure retry policy with explicit settings for transient errors
127+
# retry_on_status_codes includes 429 (rate limit) and 5XX server errors
128+
retry_policy = RetryPolicy(
129+
retry_total=3,
130+
retry_on_status_codes=[429, 500, 502, 503, 504],
131+
retry_backoff_factor=0.8,
132+
retry_backoff_max=120,
133+
)
134+
return get_http_client(retry_policy=retry_policy)
135+
136+
121137
async def ensure_service_availability_onedp(
122138
client: AIProjectClient, token: str, capability: Optional[str] = None
123139
) -> None:
@@ -1042,7 +1058,7 @@ async def evaluate_with_rai_service_sync(
10421058
)
10431059

10441060
# Sync evals endpoint implementation (default)
1045-
api_version = "2025-10-15-preview"
1061+
api_version = "2025-11-15-preview"
10461062
if not is_onedp_project(project_scope):
10471063
# Get RAI service URL from discovery service and check service availability
10481064
token = await fetch_or_reuse_token(credential)
@@ -1055,7 +1071,7 @@ async def evaluate_with_rai_service_sync(
10551071
sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
10561072
sync_eval_payload_json = json.dumps(sync_eval_payload, cls=SdkJSONEncoder)
10571073

1058-
with get_http_client() as client:
1074+
with get_sync_http_client_with_retry() as client:
10591075
http_response = client.post(url, data=sync_eval_payload_json, headers=headers)
10601076

10611077
if http_response.status_code != 200:
@@ -1255,7 +1271,7 @@ async def evaluate_with_rai_service_sync_multimodal(
12551271
)
12561272

12571273
# Sync evals endpoint implementation (default)
1258-
api_version = "2025-10-15-preview"
1274+
api_version = "2025-11-15-preview"
12591275
sync_eval_payload = _build_sync_eval_multimodal_payload(messages, metric_name)
12601276

12611277
if is_onedp_project(project_scope):
@@ -1285,7 +1301,7 @@ async def evaluate_with_rai_service_sync_multimodal(
12851301

12861302
sync_eval_payload_json = json.dumps(sync_eval_payload, cls=SdkJSONEncoder)
12871303

1288-
with get_http_client() as client:
1304+
with get_sync_http_client_with_retry() as client:
12891305
http_response = client.post(url, data=sync_eval_payload_json, headers=headers)
12901306

12911307
if http_response.status_code != 200:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def __init__(
126126
language: SupportedLanguages = SupportedLanguages.English,
127127
output_dir=".",
128128
attack_success_thresholds: Optional[Dict[RiskCategory, int]] = None,
129-
_use_legacy_endpoint: bool = False,
129+
**kwargs,
130130
):
131131
"""Initialize a new Red Team agent for AI model evaluation.
132132
@@ -156,16 +156,14 @@ def __init__(
156156
or None to use default binary evaluation (evaluation results determine success).
157157
When using thresholds, scores >= threshold are considered successful attacks.
158158
:type attack_success_thresholds: Optional[Dict[RiskCategory, int]]
159-
:param _use_legacy_endpoint: Whether to use the legacy evaluation endpoint. Defaults to False.
160-
:type _use_legacy_endpoint: bool
161159
"""
162160

163161
self.azure_ai_project = validate_azure_ai_project(azure_ai_project)
164162
self.credential = credential
165163
self.output_dir = output_dir
166164
self.language = language
167165
self._one_dp_project = is_onedp_project(azure_ai_project)
168-
self._use_legacy_endpoint = _use_legacy_endpoint
166+
self._use_legacy_endpoint = kwargs.get("_use_legacy_endpoint", False)
169167

170168
# Configure attack success thresholds
171169
self.attack_success_thresholds = self._configure_attack_success_thresholds(attack_success_thresholds)

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,8 @@ def test_evaluate_multimodal(
388388
# imageurls_with_target has 1 extra column: outputs.conversation due to the target mapping
389389
assert len(row_result_df.keys()) >= 33
390390
else:
391-
assert len(row_result_df.keys()) == 88
391+
# 98 columns: includes evaluation_per_turn fields for all evaluators
392+
assert len(row_result_df.keys()) == 98
392393
known_keys = [
393394
"outputs.content_safety.hate_unfairness",
394395
"outputs.content_safety.hate_unfairness_score",

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ async def test_get_rai_svc_url(self, client_mock, discovery_mock):
399399
@patch("azure.ai.evaluation._common.rai_service.fetch_or_reuse_token")
400400
@patch("azure.ai.evaluation._common.rai_service.get_rai_svc_url")
401401
@patch("azure.ai.evaluation._common.rai_service.ensure_service_availability")
402-
@patch("azure.ai.evaluation._common.rai_service.get_http_client")
402+
@patch("azure.ai.evaluation._common.rai_service.get_sync_http_client_with_retry")
403403
async def test_evaluate_with_rai_service_sync(
404404
self, http_client_mock, ensure_avail_mock, get_url_mock, fetch_token_mock, cred_mock
405405
):

0 commit comments

Comments
 (0)