
Commit 7d4326c

Handle None inferences in eval results for issue #2729 (#3805)
* fixed CR comments
* formatted via isort

Co-authored-by: Ankur <[email protected]>
1 parent c222a45 commit 7d4326c
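
The underlying bug: a failed inference run leaves `InferenceResult.inferences` as `None`, and the evaluation path compared its length directly. A minimal sketch of that pre-change failure mode, reconstructed from the diff below rather than copied from the repository:

    # Pre-change behavior (issue #2729, reconstructed): a failed run carries
    # inferences=None, and the length comparison in
    # _evaluate_single_inference_result then raises.
    inferences = None  # what a failed inference leaves behind
    try:
      if len(inferences) != 1:  # mirrors the len() check shown in the diff
        pass
    except TypeError as e:
      print(e)  # object of type 'NoneType' has no len()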

2 files changed: +204 −0 lines changed


src/google/adk/evaluation/local_eval_service.py

Lines changed: 41 additions & 0 deletions
@@ -268,6 +268,22 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )

+    if (
+        inference_result.status == InferenceStatus.FAILURE
+        or inference_result.inferences is None
+    ):
+      logger.error(
+          'Evaluation attempted on failed inference for eval case `%s`.'
+          ' Error: %s',
+          inference_result.eval_case_id,
+          inference_result.error_message,
+      )
+      eval_case_result = await self._build_not_evaluated_eval_case_result(
+          inference_result=inference_result,
+          user_id=user_id,
+      )
+      return (inference_result, eval_case_result)
+
     if eval_case.conversation_scenario is None and len(
         inference_result.inferences
     ) != len(eval_case.conversation):
@@ -464,6 +480,31 @@ def _generate_final_eval_status(

     return final_eval_status

+  async def _build_not_evaluated_eval_case_result(
+      self,
+      *,
+      inference_result: InferenceResult,
+      user_id: str,
+  ) -> EvalCaseResult:
+    """Constructs an EvalCaseResult for cases that could not be evaluated."""
+    session_details = await self._session_service.get_session(
+        app_name=inference_result.app_name,
+        user_id=user_id,
+        session_id=inference_result.session_id,
+    )
+
+    return EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=EvalStatus.NOT_EVALUATED,
+        overall_eval_metric_results=[],
+        eval_metric_result_per_invocation=[],
+        session_id=inference_result.session_id,
+        session_details=session_details,
+        user_id=user_id,
+    )
+
   async def _perform_inference_single_eval_item(
       self,
       app_name: str,
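
With this guard in place, both an explicit `InferenceStatus.FAILURE` and a `None` inferences payload (even under a `SUCCESS` status) are treated as not evaluable, and `evaluate()` yields an `EvalCaseResult` marked `EvalStatus.NOT_EVALUATED` instead of raising mid-run. A hypothetical caller-side sketch; `eval_service`, `evaluate_request`, and the `EvalStatus` import are assumed to be set up the same way as in the tests below:

    # Collect the ids of eval cases the service could not evaluate.
    async def collect_skipped_cases(eval_service, evaluate_request):
      skipped = []
      async for result in eval_service.evaluate(evaluate_request):
        if result.final_eval_status == EvalStatus.NOT_EVALUATED:
          skipped.append(result.eval_id)
      return skipped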

tests/unittests/evaluation/test_local_eval_service.py

Lines changed: 163 additions & 0 deletions
@@ -325,6 +325,82 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2


+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case["case_failure"]
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ["case_success", "case_unknown"]:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == "fake_metric"
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -418,6 +494,93 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED


+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_failed_inference(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.FAILURE,
+      error_message="simulated inference failure",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_missing_inferences(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.SUCCESS,
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker
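
To run just the added regression tests locally, a pytest invocation along these lines should work (the -k expression is illustrative, matching substrings of the new test names):

    pytest tests/unittests/evaluation/test_local_eval_service.py \
        -k "skips_failed or handles_failed_inference or handles_missing_inferences"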
