@@ -325,82 +325,6 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
-@pytest.mark.asyncio
-async def test_evaluate_skips_failed_inference_results(
-    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
-):
-  invocation = Invocation(
-      user_content=genai_types.Content(
-          parts=[genai_types.Part(text="test user content.")]
-      ),
-      final_response=genai_types.Content(
-          parts=[genai_types.Part(text="test final response.")]
-      ),
-  )
-  inference_results = [
-      InferenceResult(
-          app_name="test_app",
-          eval_set_id="test_eval_set",
-          eval_case_id="case_failure",
-          inferences=None,
-          session_id="session_fail",
-          status=InferenceStatus.FAILURE,
-          error_message="simulated failure",
-      ),
-      InferenceResult(
-          app_name="test_app",
-          eval_set_id="test_eval_set",
-          eval_case_id="case_success",
-          inferences=[invocation.model_copy(deep=True)],
-          session_id="session_success",
-          status=InferenceStatus.SUCCESS,
-      ),
-      InferenceResult(
-          app_name="test_app",
-          eval_set_id="test_eval_set",
-          eval_case_id="case_unknown",
-          inferences=[invocation.model_copy(deep=True)],
-          session_id="session_unknown",
-          status=InferenceStatus.UNKNOWN,
-      ),
-  ]
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
-  evaluate_request = EvaluateRequest(
-      inference_results=inference_results,
-      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
-  )
-
-  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
-  mock_eval_case.conversation_scenario = None
-  mock_eval_case.session_input = None
-  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
-
-  results = []
-  async for result in eval_service.evaluate(evaluate_request):
-    results.append(result)
-
-  assert len(results) == 3
-  results_by_case = {result.eval_id: result for result in results}
-
-  failure_result = results_by_case["case_failure"]
-  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert failure_result.overall_eval_metric_results == []
-  assert failure_result.eval_metric_result_per_invocation == []
-
-  for case_id in ["case_success", "case_unknown"]:
-    case_result = results_by_case[case_id]
-    assert case_result.final_eval_status == EvalStatus.PASSED
-    assert len(case_result.overall_eval_metric_results) == 1
-    assert (
-        case_result.overall_eval_metric_results[0].metric_name == "fake_metric"
-    )
-    assert case_result.overall_eval_metric_results[0].score == 0.9
-
-  assert mock_eval_sets_manager.get_eval_case.call_count == 3
-  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
-
-
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -494,93 +418,6 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED
 
 
-@pytest.mark.asyncio
-async def test_evaluate_single_inference_result_handles_failed_inference(
-    eval_service, mock_eval_sets_manager, mocker
-):
-  invocation = Invocation(
-      user_content=genai_types.Content(
-          parts=[genai_types.Part(text="test user content.")]
-      ),
-      final_response=genai_types.Content(
-          parts=[genai_types.Part(text="test final response.")]
-      ),
-  )
-  inference_result = InferenceResult(
-      app_name="test_app",
-      eval_set_id="test_eval_set",
-      eval_case_id="case1",
-      inferences=None,
-      session_id="session1",
-      status=InferenceStatus.FAILURE,
-      error_message="simulated inference failure",
-  )
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
-  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
-
-  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
-  mock_eval_case.conversation_scenario = None
-  mock_eval_case.session_input = None
-  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
-
-  _, result = await eval_service._evaluate_single_inference_result(
-      inference_result=inference_result, evaluate_config=evaluate_config
-  )
-
-  assert isinstance(result, EvalCaseResult)
-  assert result.eval_id == "case1"
-  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert result.overall_eval_metric_results == []
-  assert result.eval_metric_result_per_invocation == []
-  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
-      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
-  )
-
-
-@pytest.mark.asyncio
-async def test_evaluate_single_inference_result_handles_missing_inferences(
-    eval_service, mock_eval_sets_manager, mocker
-):
-  invocation = Invocation(
-      user_content=genai_types.Content(
-          parts=[genai_types.Part(text="test user content.")]
-      ),
-      final_response=genai_types.Content(
-          parts=[genai_types.Part(text="test final response.")]
-      ),
-  )
-  inference_result = InferenceResult(
-      app_name="test_app",
-      eval_set_id="test_eval_set",
-      eval_case_id="case1",
-      inferences=None,
-      session_id="session1",
-      status=InferenceStatus.SUCCESS,
-  )
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
-  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
-
-  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
-  mock_eval_case.conversation_scenario = None
-  mock_eval_case.session_input = None
-  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
-
-  _, result = await eval_service._evaluate_single_inference_result(
-      inference_result=inference_result, evaluate_config=evaluate_config
-  )
-
-  assert isinstance(result, EvalCaseResult)
-  assert result.eval_id == "case1"
-  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert result.overall_eval_metric_results == []
-  assert result.eval_metric_result_per_invocation == []
-  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
-      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
-  )
-
-
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker