From 582b1edc1ef603cfb0cd2d421bf52a7e4e18add9 Mon Sep 17 00:00:00 2001 From: A Vertex SDK engineer Date: Thu, 2 Apr 2026 21:45:06 -0700 Subject: [PATCH] feat: GenAI Client(evals) - add user-facing generate_loss_clusters with LRO polling and replay tests PiperOrigin-RevId: 893874547 --- NOTE(review): this patch text has lost its line breaks and indentation, so the hunk line counts can no longer be checked against the payload; regenerate it from the commit before applying. NOTE(review): _poll_operation and _poll_operation_async poll inside `while True` and return only once the polled operation reports done; no timeout or attempt cap is visible in this patch. NOTE(review): in t_inline_results the first (non-fallback) dict branch passes agent_data through _sanitize_agent_data without the "error" check that the DataFrame fallback applies, so an error payload there appears to be sent as an empty agent_data dict - confirm against callers. NOTE(review): _sanitize_turn copies unrecognized turn-level keys through unchanged, while _sanitize_event and _sanitize_content drop unrecognized keys - confirm this asymmetry is intended. .../replays/test_generate_loss_clusters.py | 76 +++++ tests/unit/vertexai/genai/test_evals.py | 311 +++++++++++++++++- vertexai/_genai/_evals_common.py | 18 +- vertexai/_genai/_evals_utils.py | 110 ++++++- vertexai/_genai/_transformers.py | 149 ++++++++- vertexai/_genai/evals.py | 94 ++++++ vertexai/_genai/types/common.py | 11 + 7 files changed, 755 insertions(+), 14 deletions(-) create mode 100644 tests/unit/vertexai/genai/replays/test_generate_loss_clusters.py diff --git a/tests/unit/vertexai/genai/replays/test_generate_loss_clusters.py b/tests/unit/vertexai/genai/replays/test_generate_loss_clusters.py new file mode 100644 index 0000000000..f585feb9d5 --- /dev/null +++ b/tests/unit/vertexai/genai/replays/test_generate_loss_clusters.py @@ -0,0 +1,76 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# pylint: disable=protected-access,bad-continuation,missing-function-docstring + +from tests.unit.vertexai.genai.replays import pytest_helper +from vertexai import types +import pytest + + +def test_gen_loss_clusters(client): + """Tests that generate_loss_clusters() returns GenerateLossClustersResponse.""" + eval_result = types.EvaluationResult() + response = client.evals.generate_loss_clusters( + eval_result=eval_result, + config=types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel-agent", + ), + ) + assert isinstance(response, types.GenerateLossClustersResponse) + assert len(response.results) == 1 + result = response.results[0] + assert result.config.metric == "multi_turn_task_success_v1" + assert result.config.candidate == "travel-agent" + assert len(result.clusters) == 2 + assert result.clusters[0].cluster_id == "cluster-1" + assert result.clusters[0].taxonomy_entry.l1_category == "Tool Calling" + assert ( + result.clusters[0].taxonomy_entry.l2_category == "Missing Tool Invocation" + ) + assert result.clusters[0].item_count == 3 + assert result.clusters[1].cluster_id == "cluster-2" + assert result.clusters[1].taxonomy_entry.l1_category == "Hallucination" + assert result.clusters[1].item_count == 2 + + +pytest_plugins = ("pytest_asyncio",) + + +@pytest.mark.asyncio +async def test_gen_loss_clusters_async(client): + """Tests that generate_loss_clusters() async returns GenerateLossClustersResponse.""" + eval_result = types.EvaluationResult() + response = await client.aio.evals.generate_loss_clusters( + eval_result=eval_result, + config=types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel-agent", + ), + ) + assert isinstance(response, types.GenerateLossClustersResponse) + assert len(response.results) == 1 + result = response.results[0] + assert result.config.metric == "multi_turn_task_success_v1" + assert len(result.clusters) == 2 + assert result.clusters[0].cluster_id == "cluster-1" + assert 
result.clusters[1].cluster_id == "cluster-2" + + +pytestmark = pytest_helper.setup( + file=__file__, + globals_for_file=globals(), + test_method="evals.generate_loss_clusters", +) diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py index 573f7f04d4..902769c372 100644 --- a/tests/unit/vertexai/genai/test_evals.py +++ b/tests/unit/vertexai/genai/test_evals.py @@ -29,6 +29,7 @@ from google.cloud.aiplatform import initializer as aiplatform_initializer from vertexai import _genai from vertexai._genai import _evals_data_converters +from vertexai._genai import _evals_utils from vertexai._genai import _evals_metric_handlers from vertexai._genai import _evals_visualization from vertexai._genai import _evals_metric_loaders @@ -264,6 +265,296 @@ def test_t_inline_results(self): assert payload[0]["candidate_results"][0]["candidate"] == "gemini-pro" assert payload[0]["candidate_results"][0]["score"] == 0.0 + def test_t_inline_results_sanitizes_agent_data(self): + """Tests that t_inline_results strips SDK-only fields from agent_data.""" + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "multi_turn_task_success_v1": common_types.EvalCaseMetricResult( + score=0.0, + explanation="Failed", + ) + }, + ) + ], + ) + ], + evaluation_dataset=[ + common_types.EvaluationDataset( + eval_cases=[ + common_types.EvalCase( + agent_data=vertexai_genai_types.evals.AgentData( + turns=[ + vertexai_genai_types.evals.ConversationTurn( + turn_index=0, + turn_id="turn_0", + events=[ + vertexai_genai_types.evals.AgentEvent( + author="user", + content=genai_types.Content( + role="user", + parts=[genai_types.Part(text="Hello")], + ), + ), + vertexai_genai_types.evals.AgentEvent( + author="model", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + 
function_call=genai_types.FunctionCall( + name="search", + args={"q": "test"}, + ) + ) + ], + ), + ), + vertexai_genai_types.evals.AgentEvent( + author="model", + content=genai_types.Content( + role="model", + parts=[ + genai_types.Part( + function_response=genai_types.FunctionResponse( + name="search", + response={"result": "ok"}, + ) + ) + ], + ), + ), + ], + ) + ] + ) + ) + ] + ) + ], + metadata=common_types.EvaluationRunMetadata( + candidate_names=["travel-agent"] + ), + ) + + payload = _transformers.t_inline_results([eval_result]) + assert len(payload) == 1 + + agent_data = payload[0]["request"]["prompt"]["agent_data"] + assert "turns" in agent_data + events = agent_data["turns"][0]["events"] + assert len(events) == 3 + + # Check text part is preserved + text_part = events[0]["content"]["parts"][0] + assert "text" in text_part + assert text_part["text"] == "Hello" + + # Check function_call is preserved (API-recognized field) + fc_part = events[1]["content"]["parts"][0] + assert "function_call" in fc_part + assert fc_part["function_call"]["name"] == "search" + # SDK-only fields must NOT be present + assert "tool_call" not in fc_part + assert "tool_response" not in fc_part + assert "part_metadata" not in fc_part + + # Check function_response is preserved but will_continue is stripped + fr_part = events[2]["content"]["parts"][0] + assert "function_response" in fr_part + assert fr_part["function_response"]["name"] == "search" + assert "will_continue" not in fr_part["function_response"] + + def test_sanitize_agent_data_from_dataframe(self): + """Tests sanitization when agent_data comes from DataFrame (dict form).""" + # Simulate agent_data stored in DataFrame with SDK-only fields + raw_agent_data = { + "turns": [{ + "turn_index": 0, + "turn_id": "turn_0", + "events": [{ + "author": "model", + "content": { + "role": "model", + "parts": [{ + "function_call": {"name": "find_flights", "args": {"origin": "NYC"}}, + "tool_call": None, + "tool_response": None, + 
"part_metadata": None, + }], + }, + }, { + "author": "model", + "content": { + "role": "model", + "parts": [{ + "function_response": { + "name": "find_flights", + "response": {"flights": []}, + "will_continue": False, + "scheduling": None, + }, + }], + }, + }], + }], + } + + sanitized = _transformers._sanitize_agent_data(raw_agent_data) + + parts_0 = sanitized["turns"][0]["events"][0]["content"]["parts"][0] + assert "function_call" in parts_0 + assert "tool_call" not in parts_0 + assert "tool_response" not in parts_0 + assert "part_metadata" not in parts_0 + + parts_1 = sanitized["turns"][0]["events"][1]["content"]["parts"][0] + assert "function_response" in parts_1 + assert parts_1["function_response"]["name"] == "find_flights" + assert "will_continue" not in parts_1["function_response"] + assert "scheduling" not in parts_1["function_response"] + + def test_sanitize_agent_data_skips_error_payload(self): + """Tests that error payloads from failed agent runs are stripped.""" + error_data = {"error": "Multi-turn agent run with user simulation failed"} + sanitized = _transformers._sanitize_agent_data(error_data) + assert "error" not in sanitized + assert sanitized == {} + + def test_t_inline_results_skips_error_agent_data_in_df(self): + """Tests that t_inline_results skips error agent_data from DataFrame.""" + error_json = json.dumps({"error": "Agent run failed"}) + df = pd.DataFrame({ + "prompt": ["test"], + "agent_data": [error_json], + }) + eval_result = common_types.EvaluationResult( + eval_case_results=[ + common_types.EvalCaseResult( + eval_case_index=0, + response_candidate_results=[ + common_types.ResponseCandidateResult( + response_index=0, + metric_results={ + "metric_v1": common_types.EvalCaseMetricResult( + score=0.0, + ) + }, + ) + ], + ) + ], + evaluation_dataset=[ + common_types.EvaluationDataset(eval_dataset_df=df) + ], + metadata=common_types.EvaluationRunMetadata( + candidate_names=["agent"] + ), + ) + payload = 
_transformers.t_inline_results([eval_result]) + assert len(payload) == 1 + # The prompt should have no agent_data (error was skipped) + assert "agent_data" not in payload[0]["request"]["prompt"] + + +class TestLossAnalysis: + """Unit tests for loss analysis types and visualization.""" + + def test_response_structure(self): + response = common_types.GenerateLossClustersResponse( + analysis_time="2026-04-01T10:00:00Z", + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="multi_turn_task_success_v1", + candidate="travel-agent", + ), + analysis_time="2026-04-01T10:00:00Z", + clusters=[ + common_types.LossCluster( + cluster_id="cluster-1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Tool Calling", + l2_category="Missing Tool Invocation", + description="The agent failed to invoke a required tool.", + ), + item_count=3, + ), + common_types.LossCluster( + cluster_id="cluster-2", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Hallucination", + l2_category="Hallucination of Action", + description="Verbally confirmed action without tool.", + ), + item_count=2, + ), + ], + ) + ], + ) + assert len(response.results) == 1 + assert response.analysis_time == "2026-04-01T10:00:00Z" + result = response.results[0] + assert result.config.metric == "multi_turn_task_success_v1" + assert len(result.clusters) == 2 + assert result.clusters[0].cluster_id == "cluster-1" + assert result.clusters[0].item_count == 3 + assert result.clusters[1].cluster_id == "cluster-2" + + def test_response_show_with_results(self, capsys): + response = common_types.GenerateLossClustersResponse( + results=[ + common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="test_metric", + candidate="test-candidate", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="Cat1", + l2_category="SubCat1", + ), + item_count=5, + ), + ], + ) + 
], + ) + response.show() + captured = capsys.readouterr() + assert "test_metric" in captured.out + assert "c1" in captured.out + + def test_loss_analysis_result_show(self, capsys): + result = common_types.LossAnalysisResult( + config=common_types.LossAnalysisConfig( + metric="test_metric", + candidate="test-candidate", + ), + clusters=[ + common_types.LossCluster( + cluster_id="c1", + taxonomy_entry=common_types.LossTaxonomyEntry( + l1_category="DirectCat", + l2_category="DirectSubCat", + ), + item_count=7, + ), + ], + ) + result.show() + captured = capsys.readouterr() + assert "test_metric" in captured.out + assert "c1" in captured.out + class TestEvals: """Unit tests for the GenAI client.""" @@ -1570,7 +1861,7 @@ def test_run_inference_with_local_agent( mock_runner_instance = mock_runner.return_value stream_run_return_value_1 = [ mock.Mock( - model_dump=lambda: { + model_dump=lambda **kwargs: { "id": "1", "content": {"parts": [{"text": "intermediate1"}]}, "timestamp": 123, @@ -1578,7 +1869,7 @@ def test_run_inference_with_local_agent( } ), mock.Mock( - model_dump=lambda: { + model_dump=lambda **kwargs: { "id": "2", "content": {"parts": [{"text": "agent response"}]}, "timestamp": 124, @@ -1588,7 +1879,7 @@ def test_run_inference_with_local_agent( ] stream_run_return_value_2 = [ mock.Mock( - model_dump=lambda: { + model_dump=lambda **kwargs: { "id": "3", "content": {"parts": [{"text": "intermediate2"}]}, "timestamp": 125, @@ -1596,7 +1887,7 @@ def test_run_inference_with_local_agent( } ), mock.Mock( - model_dump=lambda: { + model_dump=lambda **kwargs: { "id": "4", "content": {"parts": [{"text": "agent response 2"}]}, "timestamp": 126, @@ -2314,9 +2605,15 @@ async def test_run_adk_user_simulation_with_intermediate_events( turn["events"][3]["content"]["parts"][0]["text"] == "There are no laptops matching your search." 
) - mock_invocation.user_content.model_dump.assert_called_with(mode="json") - mock_event_1.content.model_dump.assert_called_with(mode="json") - mock_invocation.final_response.model_dump.assert_called_with(mode="json") + mock_invocation.user_content.model_dump.assert_called_with( + mode="json", exclude_none=True + ) + mock_event_1.content.model_dump.assert_called_with( + mode="json", exclude_none=True + ) + mock_invocation.final_response.model_dump.assert_called_with( + mode="json", exclude_none=True + ) @mock.patch.object(_evals_common, "_run_agent") def test_run_agent_internal_malformed_event(self, mock_run_agent): diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index a62fe2083b..e724630033 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -1014,7 +1014,9 @@ async def _run_adk_user_simulation( events.append( { "author": "user", - "content": invocation.user_content.model_dump(mode="json"), + "content": invocation.user_content.model_dump( + mode="json", exclude_none=True + ), "event_time": datetime.datetime.fromtimestamp( invocation.creation_timestamp, tz=datetime.timezone.utc ), @@ -1030,7 +1032,9 @@ async def _run_adk_user_simulation( { "author": ie.author, "content": ( - ie.content.model_dump(mode="json") + ie.content.model_dump( + mode="json", exclude_none=True + ) if ie.content else None ), @@ -1044,7 +1048,9 @@ async def _run_adk_user_simulation( events.append( { "author": "tool_call", - "content": tool_call.model_dump(mode="json"), + "content": tool_call.model_dump( + mode="json", exclude_none=True + ), "event_time": datetime.datetime.fromtimestamp( invocation.creation_timestamp, tz=datetime.timezone.utc ), @@ -1055,7 +1061,9 @@ async def _run_adk_user_simulation( events.append( { "author": "agent", - "content": invocation.final_response.model_dump(mode="json"), + "content": invocation.final_response.model_dump( + mode="json", exclude_none=True + ), "event_time": 
datetime.datetime.fromtimestamp( invocation.creation_timestamp, tz=datetime.timezone.utc ), @@ -2021,7 +2029,7 @@ async def _execute_local_agent_run_with_retry_async( new_message=new_message_content, ): if event: - event = event.model_dump() + event = event.model_dump(exclude_none=True) if event and CONTENT in event and PARTS in event[CONTENT]: events.append(event) return events diff --git a/vertexai/_genai/_evals_utils.py b/vertexai/_genai/_evals_utils.py index 9d4dd4fc71..9ddeb4ae97 100644 --- a/vertexai/_genai/_evals_utils.py +++ b/vertexai/_genai/_evals_utils.py @@ -15,9 +15,11 @@ """Utility functions for evals.""" import abc +import asyncio +import json import logging import os -import json +import time from typing import Any, Optional, Union from google.genai._api_client import BaseApiClient @@ -366,6 +368,112 @@ def _postprocess_user_scenarios_response( ) +def _display_loss_analysis_result( + result: types.LossAnalysisResult, +) -> None: + """Displays a LossAnalysisResult as a formatted pandas DataFrame.""" + metric = result.config.metric if result.config else None + candidate = result.config.candidate if result.config else None + rows = [] + for cluster in result.clusters or []: + entry = cluster.taxonomy_entry + row = { + "metric": metric, + "candidate": candidate, + "cluster_id": cluster.cluster_id, + "l1_category": entry.l1_category if entry else None, + "l2_category": entry.l2_category if entry else None, + "description": entry.description if entry else None, + "item_count": cluster.item_count, + } + rows.append(row) + + if not rows: + logger.info("No loss clusters found.") + return + + df = pd.DataFrame(rows) + try: + from IPython.display import display # pylint: disable=g-import-not-at-top + + display(df) + except ImportError: + print(df.to_string()) # pylint: disable=print-function + + + + + +def _poll_operation( + api_client: BaseApiClient, + operation: types.GenerateLossClustersOperation, + poll_interval_seconds: float = 5.0, +) -> 
types.GenerateLossClustersOperation: + """Polls a long-running operation until completion. + + Args: + api_client: The API client to use for polling. + operation: The initial operation returned from the API call. + poll_interval_seconds: Time between polls. + + Returns: + The completed operation. + """ + if operation.done: + return operation + start_time = time.time() + while True: + response = api_client.request("get", operation.name, {}, None) + response_dict = {} if not response.body else json.loads(response.body) + polled = types.GenerateLossClustersOperation._from_response( + response=response_dict, kwargs={} + ) + if polled.done: + return polled + elapsed = int(time.time() - start_time) + logger.info( + "Loss analysis operation still running... Elapsed time: %d seconds", + elapsed, + ) + time.sleep(poll_interval_seconds) + + +async def _poll_operation_async( + api_client: BaseApiClient, + operation: types.GenerateLossClustersOperation, + poll_interval_seconds: float = 5.0, +) -> types.GenerateLossClustersOperation: + """Polls a long-running operation until completion (async). + + Args: + api_client: The API client to use for polling. + operation: The initial operation returned from the API call. + poll_interval_seconds: Time between polls. + + Returns: + The completed operation. + """ + if operation.done: + return operation + start_time = time.time() + while True: + response = await api_client.async_request( + "get", operation.name, {}, None + ) + response_dict = {} if not response.body else json.loads(response.body) + polled = types.GenerateLossClustersOperation._from_response( + response=response_dict, kwargs={} + ) + if polled.done: + return polled + elapsed = int(time.time() - start_time) + logger.info( + "Loss analysis operation still running... 
Elapsed time: %d seconds", + elapsed, + ) + await asyncio.sleep(poll_interval_seconds) + + def _validate_dataset_agent_data( dataset: types.EvaluationDataset, inference_configs: Optional[dict[str, Any]] = None, diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py index 34c471ef85..e694966b8e 100644 --- a/vertexai/_genai/_transformers.py +++ b/vertexai/_genai/_transformers.py @@ -14,6 +14,7 @@ # """Transformers module for Vertex addons.""" +import json import re from typing import Any @@ -260,6 +261,118 @@ def t_metric_for_registry( return metric_payload_item +_ALLOWED_PART_FIELDS = frozenset({ + "text", "inline_data", "file_data", "function_call", "function_response", + "video_metadata", "thought", "thought_signature", "code_execution_result", + "executable_code", "media_resolution", +}) + + +def _sanitize_agent_data(agent_data: dict[str, Any]) -> dict[str, Any]: + """Strips SDK-only fields from agent_data so the API accepts the payload. + + The SDK's AgentData model may contain fields like 'tool_call', + 'tool_response', 'part_metadata', and 'will_continue' that don't exist + in the API's AgentData / Content proto. This function recursively removes + them from content parts and keeps only API-recognized top-level fields. + """ + if not isinstance(agent_data, dict): + return agent_data + + sanitized = {} + for key, value in agent_data.items(): + if key == "turns" and isinstance(value, list): + sanitized["turns"] = [ + _sanitize_turn(t) for t in value if isinstance(t, dict) + ] + elif key == "agents" and isinstance(value, dict): + sanitized["agents"] = { + k: _sanitize_agent_config(v) if isinstance(v, dict) else v + for k, v in value.items() + } + # Skip unknown top-level fields (e.g. "error" from failed agent runs). 
+ return sanitized + + +def _sanitize_agent_config(config: dict[str, Any]) -> dict[str, Any]: + """Sanitizes an AgentConfig dict, keeping only API-known fields.""" + allowed = {"agent_id", "agent_type", "description", "instruction", "tools", "sub_agents"} + return {k: v for k, v in config.items() if k in allowed} + + +def _sanitize_turn(turn: dict[str, Any]) -> dict[str, Any]: + """Sanitizes a ConversationTurn dict.""" + sanitized = {} + for key, value in turn.items(): + if key == "events" and isinstance(value, list): + sanitized["events"] = [ + _sanitize_event(e) for e in value if isinstance(e, dict) + ] + else: + sanitized[key] = value + return sanitized + + +def _sanitize_event(event: dict[str, Any]) -> dict[str, Any]: + """Sanitizes an AgentEvent dict.""" + sanitized = {} + for key, value in event.items(): + if key == "content" and isinstance(value, dict): + sanitized["content"] = _sanitize_content(value) + elif key in ("author", "event_time", "state_delta", "active_tools"): + sanitized[key] = value + # Skip unknown event-level fields. + return sanitized + + +def _sanitize_content(content: dict[str, Any]) -> dict[str, Any]: + """Sanitizes a Content dict, stripping unknown fields from parts.""" + sanitized = {} + for key, value in content.items(): + if key == "parts" and isinstance(value, list): + sanitized["parts"] = [ + _sanitize_part(p) for p in value if isinstance(p, dict) + ] + elif key == "role": + sanitized["role"] = value + return sanitized + + +def _sanitize_part(part: dict[str, Any]) -> dict[str, Any]: + """Keeps only API-recognized fields in a Part dict.""" + sanitized = {} + for key, value in part.items(): + if key in _ALLOWED_PART_FIELDS: + if key == "function_response" and isinstance(value, dict): + # Strip unknown sub-fields like 'will_continue'. 
+ sanitized[key] = { + k: v for k, v in value.items() + if k in ("name", "id", "response") + } + else: + sanitized[key] = value + return sanitized + + +def _extract_agent_data_from_df( + eval_dataset: Any, + case_idx: int, +) -> Any: + """Extracts agent_data from a DataFrame-based EvaluationDataset by row index.""" + if not eval_dataset: + return None + ds = eval_dataset[0] if isinstance(eval_dataset, list) else eval_dataset + df = getv(ds, ["eval_dataset_df"]) + if df is None or not hasattr(df, "iloc"): + return None + if case_idx < 0 or case_idx >= len(df): + return None + row = df.iloc[case_idx] + if "agent_data" not in row or row["agent_data"] is None: + return None + return row["agent_data"] + + def t_inline_results( eval_results: list[Any], ) -> list[dict[str, Any]]: @@ -292,7 +405,13 @@ def t_inline_results( if agent_data: if hasattr(agent_data, "model_dump"): - prompt_payload["agent_data"] = agent_data.model_dump() + prompt_payload["agent_data"] = _sanitize_agent_data( + agent_data.model_dump() + ) + elif isinstance(agent_data, dict): + prompt_payload["agent_data"] = _sanitize_agent_data( + agent_data + ) else: prompt_payload["agent_data"] = agent_data elif prompt: @@ -302,6 +421,34 @@ def t_inline_results( if text: prompt_payload["text"] = str(text) + # Fallback: extract agent_data from the DataFrame when eval_cases + # are not available (e.g., run_inference -> evaluate flow). + if not prompt_payload: + df_agent_data = _extract_agent_data_from_df( + eval_dataset, case_idx + ) + if df_agent_data is not None: + if hasattr(df_agent_data, "model_dump"): + prompt_payload["agent_data"] = _sanitize_agent_data( + df_agent_data.model_dump() + ) + elif isinstance(df_agent_data, str): + try: + parsed = json.loads(df_agent_data) + if isinstance(parsed, dict) and "error" in parsed: + pass # Skip error payloads from failed agent runs. 
+ else: + prompt_payload["agent_data"] = ( + _sanitize_agent_data(parsed) + ) + except (json.JSONDecodeError, ValueError): + pass + elif isinstance(df_agent_data, dict): + if "error" not in df_agent_data: + prompt_payload["agent_data"] = _sanitize_agent_data( + df_agent_data + ) + cand_results = getv(case_result, ["response_candidate_results"]) or [] for resp_cand_result in cand_results: resp_idx = getv(resp_cand_result, ["response_index"]) or 0 diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 0adf80cd6e..eb8ffe4b5f 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -2454,6 +2454,53 @@ def generate_conversation_scenarios( ) return _evals_utils._postprocess_user_scenarios_response(response) + @_common.experimental_warning( + "The Vertex SDK GenAI evals.generate_loss_clusters module is experimental, " + "and may change in future versions." + ) + def generate_loss_clusters( + self, + *, + eval_result: types.EvaluationResult, + config: types.LossAnalysisConfigOrDict, + ) -> types.GenerateLossClustersResponse: + """Generates loss clusters from evaluation results. + + Analyzes "Pass/Fail" signals from rubric-based autoraters and groups + them into semantic "Loss Patterns" (e.g., "Hallucination of Action"). + + This method calls the GenerateLossClusters LRO and polls until + completion, returning the results directly. + + Args: + eval_result: The EvaluationResult object returned from + client.evals.evaluate(). + config: Configuration for the loss analysis, specifying the + metric and candidate to analyze. Can be a LossAnalysisConfig + object or a dict. + + Returns: + A GenerateLossClustersResponse containing the analysis results. + Call .show() to visualize, or access .results for individual + LossAnalysisResult objects (each with their own .show()). 
+ """ + parsed_config = ( + types.LossAnalysisConfig.model_validate(config) + if isinstance(config, dict) + else config + ) + operation = self._generate_loss_clusters( + inline_results=[eval_result], + configs=[parsed_config], + ) + completed = _evals_utils._poll_operation( + api_client=self._api_client, + operation=operation, + ) + if completed.error: + raise RuntimeError(f"Loss analysis operation failed: {completed.error}") + return completed.response + @_common.experimental_warning( "The Vertex SDK GenAI evals.create_evaluation_metric method is experimental, " "and may change in future versions." @@ -3731,6 +3778,53 @@ async def generate_conversation_scenarios( ) return _evals_utils._postprocess_user_scenarios_response(response) + @_common.experimental_warning( + "The Vertex SDK GenAI evals.generate_loss_clusters module is experimental, " + "and may change in future versions." + ) + async def generate_loss_clusters( + self, + *, + eval_result: types.EvaluationResult, + config: types.LossAnalysisConfigOrDict, + ) -> types.GenerateLossClustersResponse: + """Generates loss clusters from evaluation results. + + Analyzes "Pass/Fail" signals from rubric-based autoraters and groups + them into semantic "Loss Patterns" (e.g., "Hallucination of Action"). + + This method calls the GenerateLossClusters LRO and polls until + completion, returning the results directly. + + Args: + eval_result: The EvaluationResult object returned from + client.evals.evaluate(). + config: Configuration for the loss analysis, specifying the + metric and candidate to analyze. Can be a LossAnalysisConfig + object or a dict. + + Returns: + A GenerateLossClustersResponse containing the analysis results. + Call .show() to visualize, or access .results for individual + LossAnalysisResult objects (each with their own .show()). 
+ """ + parsed_config = ( + types.LossAnalysisConfig.model_validate(config) + if isinstance(config, dict) + else config + ) + operation = await self._generate_loss_clusters( + inline_results=[eval_result], + configs=[parsed_config], + ) + completed = await _evals_utils._poll_operation_async( + api_client=self._api_client, + operation=operation, + ) + if completed.error: + raise RuntimeError(f"Loss analysis operation failed: {completed.error}") + return completed.response + @_common.experimental_warning( "The Vertex SDK GenAI evals.create_evaluation_metric module is experimental, " "and may change in future versions." diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index 2d9f4b50ce..d3616a460b 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -4920,6 +4920,12 @@ class LossAnalysisResult(_common.BaseModel): default=None, description="""The list of identified loss clusters.""" ) + def show(self) -> None: + """Shows the loss analysis result as a formatted pandas DataFrame.""" + from .. import _evals_utils + + _evals_utils._display_loss_analysis_result(self) + class LossAnalysisResultDict(TypedDict, total=False): """The top-level result for loss analysis.""" @@ -4948,6 +4954,11 @@ class GenerateLossClustersResponse(_common.BaseModel): description="""The analysis results, one per config provided in the request.""", ) + def show(self) -> None: + """Shows all loss analysis results.""" + for result in self.results or []: + result.show() + class GenerateLossClustersResponseDict(TypedDict, total=False): """Response message for EvaluationAnalyticsService.GenerateLossClusters."""