Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pylint: disable=protected-access,bad-continuation,missing-function-docstring

from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai import types
import pytest


def test_gen_loss_clusters(client):
    """Tests that generate_loss_clusters() returns GenerateLossClustersResponse."""
    loss_config = types.LossAnalysisConfig(
        metric="multi_turn_task_success_v1",
        candidate="travel-agent",
    )
    response = client.evals.generate_loss_clusters(
        eval_result=types.EvaluationResult(),
        config=loss_config,
    )
    assert isinstance(response, types.GenerateLossClustersResponse)
    assert len(response.results) == 1
    result = response.results[0]
    # The resolved config is echoed back on each result.
    assert result.config.metric == "multi_turn_task_success_v1"
    assert result.config.candidate == "travel-agent"
    assert len(result.clusters) == 2
    first, second = result.clusters
    assert first.cluster_id == "cluster-1"
    assert first.taxonomy_entry.l1_category == "Tool Calling"
    assert first.taxonomy_entry.l2_category == "Missing Tool Invocation"
    assert first.item_count == 3
    assert second.cluster_id == "cluster-2"
    assert second.taxonomy_entry.l1_category == "Hallucination"
    assert second.item_count == 2


# Load pytest-asyncio so the @pytest.mark.asyncio test below is executed.
pytest_plugins = ("pytest_asyncio",)


@pytest.mark.asyncio
async def test_gen_loss_clusters_async(client):
    """Tests that generate_loss_clusters() async returns GenerateLossClustersResponse."""
    response = await client.aio.evals.generate_loss_clusters(
        eval_result=types.EvaluationResult(),
        config=types.LossAnalysisConfig(
            metric="multi_turn_task_success_v1",
            candidate="travel-agent",
        ),
    )
    assert isinstance(response, types.GenerateLossClustersResponse)
    assert len(response.results) == 1
    first_result = response.results[0]
    assert first_result.config.metric == "multi_turn_task_success_v1"
    assert len(first_result.clusters) == 2
    assert first_result.clusters[0].cluster_id == "cluster-1"
    assert first_result.clusters[1].cluster_id == "cluster-2"


# Hook this module into the replay test harness; test_method names the
# client method whose recorded API interactions are replayed.
pytestmark = pytest_helper.setup(
    file=__file__,
    globals_for_file=globals(),
    test_method="evals.generate_loss_clusters",
)
277 changes: 277 additions & 0 deletions tests/unit/vertexai/genai/test_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from google.cloud.aiplatform import initializer as aiplatform_initializer
from vertexai import _genai
from vertexai._genai import _evals_data_converters
from vertexai._genai import _evals_utils
from vertexai._genai import _evals_metric_handlers
from vertexai._genai import _evals_visualization
from vertexai._genai import _evals_metric_loaders
Expand Down Expand Up @@ -265,6 +266,282 @@ def test_t_inline_results(self):
assert payload[0]["candidate_results"][0]["score"] == 0.0


class TestLossAnalysis:
    """Unit tests for loss analysis types and visualization."""

    def test_response_structure(self):
        """Nested response/result/cluster fields survive construction intact."""
        tool_cluster = common_types.LossCluster(
            cluster_id="cluster-1",
            taxonomy_entry=common_types.LossTaxonomyEntry(
                l1_category="Tool Calling",
                l2_category="Missing Tool Invocation",
                description="The agent failed to invoke a required tool.",
            ),
            item_count=3,
        )
        hallucination_cluster = common_types.LossCluster(
            cluster_id="cluster-2",
            taxonomy_entry=common_types.LossTaxonomyEntry(
                l1_category="Hallucination",
                l2_category="Hallucination of Action",
                description="Verbally confirmed action without tool.",
            ),
            item_count=2,
        )
        response = common_types.GenerateLossClustersResponse(
            analysis_time="2026-04-01T10:00:00Z",
            results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="multi_turn_task_success_v1",
                        candidate="travel-agent",
                    ),
                    analysis_time="2026-04-01T10:00:00Z",
                    clusters=[tool_cluster, hallucination_cluster],
                )
            ],
        )
        assert len(response.results) == 1
        assert response.analysis_time == "2026-04-01T10:00:00Z"
        result = response.results[0]
        assert result.config.metric == "multi_turn_task_success_v1"
        assert len(result.clusters) == 2
        assert result.clusters[0].cluster_id == "cluster-1"
        assert result.clusters[0].item_count == 3
        assert result.clusters[1].cluster_id == "cluster-2"

    def test_response_show_with_results(self, capsys):
        """show() on the response prints the metric name and cluster ids."""
        response = common_types.GenerateLossClustersResponse(
            results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="test_metric",
                        candidate="test-candidate",
                    ),
                    clusters=[
                        common_types.LossCluster(
                            cluster_id="c1",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Cat1",
                                l2_category="SubCat1",
                            ),
                            item_count=5,
                        )
                    ],
                )
            ],
        )
        response.show()
        printed = capsys.readouterr().out
        assert "test_metric" in printed
        assert "c1" in printed

    def test_loss_analysis_result_show(self, capsys):
        """show() on a single result prints the metric name and cluster ids."""
        single_cluster = common_types.LossCluster(
            cluster_id="c1",
            taxonomy_entry=common_types.LossTaxonomyEntry(
                l1_category="DirectCat",
                l2_category="DirectSubCat",
            ),
            item_count=7,
        )
        result = common_types.LossAnalysisResult(
            config=common_types.LossAnalysisConfig(
                metric="test_metric",
                candidate="test-candidate",
            ),
            clusters=[single_cluster],
        )
        result.show()
        printed = capsys.readouterr().out
        assert "test_metric" in printed
        assert "c1" in printed


def _make_eval_result(
    metrics=None,
    candidate_names=None,
):
    """Helper to create an EvaluationResult with the given metrics and candidates."""
    # NOTE: `or` (not an `is None` check) is intentional — a falsy value such
    # as [] also falls back to the default, which some callers rely on.
    metrics = metrics or ["task_success_v1"]
    candidate_names = candidate_names or ["agent-1"]

    per_metric = {
        name: common_types.EvalCaseMetricResult(metric_name=name) for name in metrics
    }
    single_case = common_types.EvalCaseResult(
        eval_case_index=0,
        response_candidate_results=[
            common_types.ResponseCandidateResult(
                response_index=0,
                metric_results=per_metric,
            )
        ],
    )
    return common_types.EvaluationResult(
        eval_case_results=[single_case],
        metadata=common_types.EvaluationRunMetadata(
            candidate_names=candidate_names,
        ),
    )


class TestResolveMetricName:
    """Unit tests for _resolve_metric_name."""

    def test_none_returns_none(self):
        assert _evals_utils._resolve_metric_name(None) is None

    def test_string_passes_through(self):
        resolved = _evals_utils._resolve_metric_name("task_success_v1")
        assert resolved == "task_success_v1"

    def test_metric_object_extracts_name(self):
        resolved = _evals_utils._resolve_metric_name(
            common_types.Metric(name="multi_turn_task_success_v1")
        )
        assert resolved == "multi_turn_task_success_v1"

    def test_object_with_name_attr(self):
        """Tests that any object with a .name attribute works (e.g., LazyLoadedPrebuiltMetric)."""

        class StubMetric:
            name = "tool_use_quality_v1"

        assert _evals_utils._resolve_metric_name(StubMetric()) == "tool_use_quality_v1"

    def test_lazy_loaded_prebuilt_metric_resolves_versioned_name(self):
        """Tests that LazyLoadedPrebuiltMetric resolves to the versioned API spec name."""

        class StubLazyMetric:
            name = "MULTI_TURN_TASK_SUCCESS"

            def _get_api_metric_spec_name(self):
                return "multi_turn_task_success_v1"

        resolved = _evals_utils._resolve_metric_name(StubLazyMetric())
        assert resolved == "multi_turn_task_success_v1"

    def test_lazy_loaded_prebuilt_metric_falls_back_to_name(self):
        """Tests fallback to .name when _get_api_metric_spec_name returns None."""

        class StubLazyMetricNoSpec:
            name = "CUSTOM_METRIC"

            def _get_api_metric_spec_name(self):
                return None

        resolved = _evals_utils._resolve_metric_name(StubLazyMetricNoSpec())
        assert resolved == "CUSTOM_METRIC"


class TestResolveLossAnalysisConfig:
    """Unit tests for _resolve_loss_analysis_config."""

    def test_auto_infer_single_metric_and_candidate(self):
        """A lone metric/candidate pair is inferred without explicit arguments."""
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=_make_eval_result(
                metrics=["task_success_v1"], candidate_names=["agent-1"]
            )
        )
        assert resolved.metric == "task_success_v1"
        assert resolved.candidate == "agent-1"

    def test_explicit_metric_and_candidate(self):
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=_make_eval_result(
                metrics=["m1", "m2"], candidate_names=["c1", "c2"]
            ),
            metric="m1",
            candidate="c2",
        )
        assert resolved.metric == "m1"
        assert resolved.candidate == "c2"

    def test_config_provides_metric_and_candidate(self):
        """Values supplied via a LossAnalysisConfig object are honored."""
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=_make_eval_result(metrics=["m1"], candidate_names=["c1"]),
            config=common_types.LossAnalysisConfig(
                metric="m1", candidate="c1", predefined_taxonomy="my_taxonomy"
            ),
        )
        assert resolved.metric == "m1"
        assert resolved.candidate == "c1"
        assert resolved.predefined_taxonomy == "my_taxonomy"

    def test_explicit_args_override_config(self):
        """Keyword arguments win over the values carried by config."""
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=_make_eval_result(
                metrics=["m1", "m2"], candidate_names=["c1", "c2"]
            ),
            config=common_types.LossAnalysisConfig(metric="m1", candidate="c1"),
            metric="m2",
            candidate="c2",
        )
        assert resolved.metric == "m2"
        assert resolved.candidate == "c2"

    def test_error_multiple_metrics_no_explicit(self):
        ambiguous = _make_eval_result(metrics=["m1", "m2"], candidate_names=["c1"])
        with pytest.raises(ValueError, match="multiple metrics"):
            _evals_utils._resolve_loss_analysis_config(eval_result=ambiguous)

    def test_error_multiple_candidates_no_explicit(self):
        ambiguous = _make_eval_result(metrics=["m1"], candidate_names=["c1", "c2"])
        with pytest.raises(ValueError, match="multiple candidates"):
            _evals_utils._resolve_loss_analysis_config(eval_result=ambiguous)

    def test_error_invalid_metric(self):
        with pytest.raises(ValueError, match="not found in eval_result"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=_make_eval_result(metrics=["m1"], candidate_names=["c1"]),
                metric="nonexistent",
            )

    def test_error_invalid_candidate(self):
        with pytest.raises(ValueError, match="not found in eval_result"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=_make_eval_result(metrics=["m1"], candidate_names=["c1"]),
                candidate="nonexistent",
            )

    def test_no_candidates_defaults_to_candidate_1(self):
        """Without candidate metadata, resolution falls back to 'candidate_1'."""
        # Replace the metadata so no candidate names are present at all.
        bare_result = _make_eval_result(
            metrics=["m1"], candidate_names=[]
        ).model_copy(update={"metadata": common_types.EvaluationRunMetadata()})
        resolved = _evals_utils._resolve_loss_analysis_config(eval_result=bare_result)
        assert resolved.metric == "m1"
        assert resolved.candidate == "candidate_1"

    def test_no_eval_case_results_raises(self):
        with pytest.raises(ValueError, match="no metric results"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=common_types.EvaluationResult()
            )


class TestEvals:
"""Unit tests for the GenAI client."""

Expand Down
Loading
Loading