@@ -325,6 +325,82 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case["case_failure"]
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ["case_success", "case_unknown"]:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == "fake_metric"
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -418,6 +494,93 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED
 
 
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_failed_inference(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.FAILURE,
+      error_message="simulated inference failure",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_missing_inferences(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.SUCCESS,
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker
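
For context, the behavior these new tests pin down is a small guard: an `InferenceResult` with `FAILURE` status, or with no `inferences` at all, should come back as an `EvalCaseResult` marked `NOT_EVALUATED` with empty metric results, while results that do carry inferences (including `UNKNOWN` status) still go through metric evaluation. The sketch below illustrates that guard with self-contained stand-in types that mirror only the fields asserted above; it is not the actual `LocalEvalService` code, and `evaluate_or_skip` and `run_metrics` are hypothetical names.

```python
# A minimal sketch of the skip behavior asserted by the tests above. The
# enums, dataclasses, and function names here are stand-ins inferred from
# the assertions; they are not the ADK implementation.
import enum
from dataclasses import dataclass, field
from typing import Callable, Optional


class InferenceStatus(enum.Enum):
  UNKNOWN = "unknown"
  SUCCESS = "success"
  FAILURE = "failure"


class EvalStatus(enum.Enum):
  PASSED = "passed"
  FAILED = "failed"
  NOT_EVALUATED = "not_evaluated"


@dataclass
class InferenceResult:
  eval_case_id: str
  status: InferenceStatus
  inferences: Optional[list] = None


@dataclass
class EvalCaseResult:
  eval_id: str
  final_eval_status: EvalStatus
  overall_eval_metric_results: list = field(default_factory=list)
  eval_metric_result_per_invocation: list = field(default_factory=list)


def evaluate_or_skip(
    inference_result: InferenceResult,
    run_metrics: Callable[[InferenceResult], EvalCaseResult],
) -> EvalCaseResult:
  """Skips metric evaluation for failed or empty inference results."""
  if (
      inference_result.status == InferenceStatus.FAILURE
      or not inference_result.inferences
  ):
    # Nothing to score: report the case as NOT_EVALUATED with no metrics.
    return EvalCaseResult(
        eval_id=inference_result.eval_case_id,
        final_eval_status=EvalStatus.NOT_EVALUATED,
    )
  # Inferences are present (status SUCCESS or UNKNOWN): evaluate as usual.
  return run_metrics(inference_result)


skipped = evaluate_or_skip(
    InferenceResult(eval_case_id="case_failure", status=InferenceStatus.FAILURE),
    run_metrics=lambda unused: EvalCaseResult("x", EvalStatus.PASSED),
)
assert skipped.final_eval_status is EvalStatus.NOT_EVALUATED
assert skipped.overall_eval_metric_results == []
```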