diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py index f444a59dd0..4f09093fa7 100644 --- a/tests/unit/vertexai/genai/test_evals.py +++ b/tests/unit/vertexai/genai/test_evals.py @@ -20,6 +20,7 @@ import re import statistics import sys +import tempfile import unittest from unittest import mock @@ -44,7 +45,6 @@ import pandas as pd import pytest - _TEST_PROJECT = "test-project" _TEST_LOCATION = "us-central1" @@ -755,27 +755,27 @@ def test_inference_with_local_destination( mock_generate_content_response ) - local_dest_dir = "/tmp/test/output_dir" - config = vertexai_genai_types.EvalRunInferenceConfig(dest=local_dest_dir) + with tempfile.TemporaryDirectory() as local_dest_dir: + config = vertexai_genai_types.EvalRunInferenceConfig(dest=local_dest_dir) - inference_result = self.client.evals.run_inference( - model="gemini-pro", src=mock_df, config=config - ) + inference_result = self.client.evals.run_inference( + model="gemini-pro", src=mock_df, config=config + ) - mock_makedirs.assert_called_once_with(local_dest_dir, exist_ok=True) - expected_save_path = os.path.join(local_dest_dir, "inference_results.jsonl") - mock_df_to_json.assert_called_once_with( - expected_save_path, orient="records", lines=True - ) - expected_df = pd.DataFrame( - { - "prompt": ["local save"], - "response": ["local response"], - } - ) - pd.testing.assert_frame_equal(inference_result.eval_dataset_df, expected_df) - assert inference_result.candidate_name == "gemini-pro" - assert inference_result.gcs_source is None + mock_makedirs.assert_called_once_with(local_dest_dir, exist_ok=True) + expected_save_path = os.path.join(local_dest_dir, "inference_results.jsonl") + mock_df_to_json.assert_called_once_with( + expected_save_path, orient="records", lines=True + ) + expected_df = pd.DataFrame( + { + "prompt": ["local save"], + "response": ["local response"], + } + ) + pd.testing.assert_frame_equal(inference_result.eval_dataset_df, expected_df) + assert inference_result.candidate_name == "gemini-pro" + assert inference_result.gcs_source is None @mock.patch.object(_evals_common, "Models") @mock.patch.object(_evals_utils, "EvalDatasetLoader") @@ -816,201 +816,199 @@ def test_inference_from_request_column_save_to_local_dir( mock_generate_content_responses ) - local_dest_dir = "/tmp/test_output_dir" - config = vertexai_genai_types.EvalRunInferenceConfig(dest=local_dest_dir) + with tempfile.TemporaryDirectory() as local_dest_dir: + config = vertexai_genai_types.EvalRunInferenceConfig(dest=local_dest_dir) - inference_result = self.client.evals.run_inference( - model="gemini-pro", src=mock_df, config=config - ) + inference_result = self.client.evals.run_inference( + model="gemini-pro", src=mock_df, config=config + ) - mock_models.return_value.generate_content.assert_has_calls( - [ - mock.call( - model="gemini-pro", - contents="req 1", - config=genai_types.GenerateContentConfig(), - ), - mock.call( - model="gemini-pro", - contents="req 2", - config=genai_types.GenerateContentConfig(), + mock_models.return_value.generate_content.assert_has_calls( + [ + mock.call( + model="gemini-pro", + contents="req 1", + config=genai_types.GenerateContentConfig(), + ), + mock.call( + model="gemini-pro", + contents="req 2", + config=genai_types.GenerateContentConfig(), + ), + ], + any_order=True, + ) + expected_df = pd.DataFrame( + { + "prompt": ["prompt 1", "prompt 2"], + "request": ["req 1", "req 2"], + "response": ["resp 1", "resp 2"], + } + ) + pd.testing.assert_frame_equal( + inference_result.eval_dataset_df.sort_values(by="request").reset_index( + drop=True ), - ], - any_order=True, - ) - expected_df = pd.DataFrame( - { - "prompt": ["prompt 1", "prompt 2"], - "request": ["req 1", "req 2"], - "response": ["resp 1", "resp 2"], - } - ) - pd.testing.assert_frame_equal( - inference_result.eval_dataset_df.sort_values(by="request").reset_index( - drop=True - ), - expected_df.sort_values(by="request").reset_index(drop=True), - ) + expected_df.sort_values(by="request").reset_index(drop=True), + ) - saved_file_path = os.path.join(local_dest_dir, "inference_results.jsonl") - with open(saved_file_path, "r") as f: - saved_records = [json.loads(line) for line in f] - expected_records = expected_df.to_dict(orient="records") - assert sorted(saved_records, key=lambda x: x["request"]) == sorted( - expected_records, key=lambda x: x["request"] - ) - os.remove(saved_file_path) - os.rmdir(local_dest_dir) - assert inference_result.candidate_name == "gemini-pro" - assert inference_result.gcs_source is None + saved_file_path = os.path.join(local_dest_dir, "inference_results.jsonl") + with open(saved_file_path, "r") as f: + saved_records = [json.loads(line) for line in f] + expected_records = expected_df.to_dict(orient="records") + assert sorted(saved_records, key=lambda x: x["request"]) == sorted( + expected_records, key=lambda x: x["request"] + ) + assert inference_result.candidate_name == "gemini-pro" + assert inference_result.gcs_source is None @mock.patch.object(_evals_common, "Models") def test_inference_from_local_jsonl_file(self, mock_models): - local_src_path = "/tmp/input.jsonl" - input_records = [ - {"prompt": "prompt 1", "other_col": "val 1"}, - {"prompt": "prompt 2", "other_col": "val 2"}, - ] - with open(local_src_path, "w") as f: - for record in input_records: - f.write(json.dumps(record) + "\n") - - mock_generate_content_responses = [ - genai_types.GenerateContentResponse( - candidates=[ - genai_types.Candidate( - content=genai_types.Content( - parts=[genai_types.Part(text="resp 1")] - ), - finish_reason=genai_types.FinishReason.STOP, - ) - ], - prompt_feedback=None, - ), - genai_types.GenerateContentResponse( - candidates=[ - genai_types.Candidate( - content=genai_types.Content( - parts=[genai_types.Part(text="resp 2")] - ), - finish_reason=genai_types.FinishReason.STOP, - ) - ], - prompt_feedback=None, - ), - ] - mock_models.return_value.generate_content.side_effect = ( - mock_generate_content_responses - ) + with tempfile.TemporaryDirectory() as temp_dir: + local_src_path = os.path.join(temp_dir, "input.jsonl") + input_records = [ + {"prompt": "prompt 1", "other_col": "val 1"}, + {"prompt": "prompt 2", "other_col": "val 2"}, + ] + with open(local_src_path, "w") as f: + for record in input_records: + f.write(json.dumps(record) + "\n") + + mock_generate_content_responses = [ + genai_types.GenerateContentResponse( + candidates=[ + genai_types.Candidate( + content=genai_types.Content( + parts=[genai_types.Part(text="resp 1")] + ), + finish_reason=genai_types.FinishReason.STOP, + ) + ], + prompt_feedback=None, + ), + genai_types.GenerateContentResponse( + candidates=[ + genai_types.Candidate( + content=genai_types.Content( + parts=[genai_types.Part(text="resp 2")] + ), + finish_reason=genai_types.FinishReason.STOP, + ) + ], + prompt_feedback=None, + ), + ] + mock_models.return_value.generate_content.side_effect = ( + mock_generate_content_responses + ) - inference_result = self.client.evals.run_inference( - model="gemini-pro", src=local_src_path - ) + inference_result = self.client.evals.run_inference( + model="gemini-pro", src=local_src_path + ) - expected_df = pd.DataFrame( - { - "prompt": ["prompt 1", "prompt 2"], - "other_col": ["val 1", "val 2"], - "response": ["resp 1", "resp 2"], - } - ) - pd.testing.assert_frame_equal( - inference_result.eval_dataset_df.sort_values(by="prompt").reset_index( - drop=True - ), - expected_df.sort_values(by="prompt").reset_index(drop=True), - ) - mock_models.return_value.generate_content.assert_has_calls( - [ - mock.call( - model="gemini-pro", - contents="prompt 1", - config=genai_types.GenerateContentConfig(), - ), - mock.call( - model="gemini-pro", - contents="prompt 2", - config=genai_types.GenerateContentConfig(), + expected_df = pd.DataFrame( + { + "prompt": ["prompt 1", "prompt 2"], + "other_col": ["val 1", "val 2"], + "response": ["resp 1", "resp 2"], + } + ) + pd.testing.assert_frame_equal( + inference_result.eval_dataset_df.sort_values(by="prompt").reset_index( + drop=True ), - ], - any_order=True, - ) - os.remove(local_src_path) - assert inference_result.candidate_name == "gemini-pro" - assert inference_result.gcs_source is None + expected_df.sort_values(by="prompt").reset_index(drop=True), + ) + mock_models.return_value.generate_content.assert_has_calls( + [ + mock.call( + model="gemini-pro", + contents="prompt 1", + config=genai_types.GenerateContentConfig(), + ), + mock.call( + model="gemini-pro", + contents="prompt 2", + config=genai_types.GenerateContentConfig(), + ), + ], + any_order=True, + ) + assert inference_result.candidate_name == "gemini-pro" + assert inference_result.gcs_source is None @pytest.mark.skip(reason="currently flakey") @mock.patch.object(_evals_common, "Models") def test_inference_from_local_csv_file(self, mock_models): - local_src_path = "/tmp/input.csv" - input_df = pd.DataFrame( - {"prompt": ["prompt 1", "prompt 2"], "other_col": ["val 1", "val 2"]} - ) - input_df.to_csv(local_src_path, index=False) + with tempfile.TemporaryDirectory() as temp_dir: + local_src_path = os.path.join(temp_dir, "input.csv") + input_df = pd.DataFrame( + {"prompt": ["prompt 1", "prompt 2"], "other_col": ["val 1", "val 2"]} + ) + input_df.to_csv(local_src_path, index=False) - mock_generate_content_responses = [ - genai_types.GenerateContentResponse( - candidates=[ - genai_types.Candidate( - content=genai_types.Content( - parts=[genai_types.Part(text="resp 1")] - ), - finish_reason=genai_types.FinishReason.STOP, - ) - ], - prompt_feedback=None, - ), - genai_types.GenerateContentResponse( - candidates=[ - genai_types.Candidate( - content=genai_types.Content( - parts=[genai_types.Part(text="resp 2")] - ), - finish_reason=genai_types.FinishReason.STOP, - ) - ], - prompt_feedback=None, - ), - ] - mock_models.return_value.generate_content.side_effect = ( - mock_generate_content_responses - ) + mock_generate_content_responses = [ + genai_types.GenerateContentResponse( + candidates=[ + genai_types.Candidate( + content=genai_types.Content( + parts=[genai_types.Part(text="resp 1")] + ), + finish_reason=genai_types.FinishReason.STOP, + ) + ], + prompt_feedback=None, + ), + genai_types.GenerateContentResponse( + candidates=[ + genai_types.Candidate( + content=genai_types.Content( + parts=[genai_types.Part(text="resp 2")] + ), + finish_reason=genai_types.FinishReason.STOP, + ) + ], + prompt_feedback=None, + ), + ] + mock_models.return_value.generate_content.side_effect = ( + mock_generate_content_responses + ) - inference_result = self.client.evals.run_inference( - model="gemini-pro", src=local_src_path - ) + inference_result = self.client.evals.run_inference( + model="gemini-pro", src=local_src_path + ) - expected_df = pd.DataFrame( - { - "prompt": ["prompt 1", "prompt 2"], - "other_col": ["val 1", "val 2"], - "response": ["resp 1", "resp 2"], - } - ) - pd.testing.assert_frame_equal( - inference_result.eval_dataset_df.sort_values(by="prompt").reset_index( - drop=True - ), - expected_df.sort_values(by="prompt").reset_index(drop=True), - ) - mock_models.return_value.generate_content.assert_has_calls( - [ - mock.call( - model="gemini-pro", - contents="prompt 1", - config=genai_types.GenerateContentConfig(), - ), - mock.call( - model="gemini-pro", - contents="prompt 2", - config=genai_types.GenerateContentConfig(), + expected_df = pd.DataFrame( + { + "prompt": ["prompt 1", "prompt 2"], + "other_col": ["val 1", "val 2"], + "response": ["resp 1", "resp 2"], + } + ) + pd.testing.assert_frame_equal( + inference_result.eval_dataset_df.sort_values(by="prompt").reset_index( + drop=True ), - ], - any_order=True, - ) - os.remove(local_src_path) - assert inference_result.candidate_name == "gemini-pro" - assert inference_result.gcs_source is None + expected_df.sort_values(by="prompt").reset_index(drop=True), + ) + mock_models.return_value.generate_content.assert_has_calls( + [ + mock.call( + model="gemini-pro", + contents="prompt 1", + config=genai_types.GenerateContentConfig(), + ), + mock.call( + model="gemini-pro", + contents="prompt 2", + config=genai_types.GenerateContentConfig(), + ), + ], + any_order=True, + ) + assert inference_result.candidate_name == "gemini-pro" + assert inference_result.gcs_source is None @mock.patch.object(_evals_common, "Models") @mock.patch.object(_evals_utils, "EvalDatasetLoader") @@ -2079,6 +2077,99 @@ def test_has_tool_call_with_agent_event(self): @pytest.mark.usefixtures("google_auth_mock") +class TestRunAgent: + """Unit tests for the _run_agent function.""" + + @mock.patch.object(_evals_common, "_execute_inference_concurrently") + def test_run_agent_rewrites_gemini_3_model_name( + self, mock_execute_inference_concurrently, mock_api_client_fixture + ): + mock_execute_inference_concurrently.return_value = [] + user_simulator_config = vertexai_genai_types.evals.UserSimulatorConfig( + model_name="gemini-3-preview" + ) + prompt_dataset = pd.DataFrame({"prompt": ["prompt1"]}) + with mock.patch.dict(os.environ, clear=True): + os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1" + + def mock_execute(*args, **kwargs): + assert os.environ["GOOGLE_CLOUD_LOCATION"] == "global" + return [] + + mock_execute_inference_concurrently.side_effect = mock_execute + + _evals_common._run_agent( + api_client=mock_api_client_fixture, + agent_engine=mock.Mock(), + agent=None, + prompt_dataset=prompt_dataset, + user_simulator_config=user_simulator_config, + allow_cross_region_model=True, + ) + + assert ( + user_simulator_config.model_name + == f"projects/{mock_api_client_fixture.project}/locations/global/publishers/google/models/gemini-3-preview" + ) + assert os.environ.get("GOOGLE_CLOUD_LOCATION") == "us-central1" + + @mock.patch.object(_evals_common, "_execute_inference_concurrently") + def test_run_agent_raises_error_if_gemini_3_and_allow_cross_region_model_false( + self, mock_execute_inference_concurrently, mock_api_client_fixture + ): + user_simulator_config = vertexai_genai_types.evals.UserSimulatorConfig( + model_name="gemini-3-preview" + ) + prompt_dataset = pd.DataFrame({"prompt": ["prompt1"]}) + with mock.patch.dict(os.environ, clear=True): + os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1" + + with pytest.raises( + ValueError, + match="The model 'gemini-3-preview' is currently only available in the 'global' region.", + ): + _evals_common._run_agent( + api_client=mock_api_client_fixture, + agent_engine=mock.Mock(), + agent=None, + prompt_dataset=prompt_dataset, + user_simulator_config=user_simulator_config, + allow_cross_region_model=False, + ) + + @mock.patch.object(_evals_common, "_execute_inference_concurrently") + def test_run_agent_rewrites_gemini_3_model_name_empty_env( + self, mock_execute_inference_concurrently, mock_api_client_fixture + ): + mock_execute_inference_concurrently.return_value = [] + user_simulator_config = vertexai_genai_types.evals.UserSimulatorConfig( + model_name="gemini-3-preview" + ) + prompt_dataset = pd.DataFrame({"prompt": ["prompt1"]}) + with mock.patch.dict(os.environ, clear=True): + + def mock_execute(*args, **kwargs): + assert os.environ["GOOGLE_CLOUD_LOCATION"] == "global" + return [] + + mock_execute_inference_concurrently.side_effect = mock_execute + + _evals_common._run_agent( + api_client=mock_api_client_fixture, + agent_engine=mock.Mock(), + agent=None, + prompt_dataset=prompt_dataset, + user_simulator_config=user_simulator_config, + allow_cross_region_model=True, + ) + + assert ( + user_simulator_config.model_name + == f"projects/{mock_api_client_fixture.project}/locations/global/publishers/google/models/gemini-3-preview" + ) + assert "GOOGLE_CLOUD_LOCATION" not in os.environ + + class TestRunAgentInternal: """Unit tests for the _run_agent_internal function.""" diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index aace1f4ead..b51017a889 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -1143,6 +1143,7 @@ def _execute_inference( prompt_template: Optional[Union[str, types.PromptTemplateOrDict]] = None, location: Optional[str] = None, user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None, + allow_cross_region_model: bool = False, ) -> pd.DataFrame: """Executes inference on a given dataset using the specified model. @@ -1250,6 +1251,7 @@ def _execute_inference( agent=agent, prompt_dataset=prompt_dataset, user_simulator_config=user_simulator_config, + allow_cross_region_model=allow_cross_region_model, ) end_time = time.time() logger.info("Agent Run completed in %.2f seconds.", end_time - start_time) @@ -1823,6 +1825,7 @@ def _run_agent_internal( agent: Optional[LlmAgent], prompt_dataset: pd.DataFrame, user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None, + allow_cross_region_model: bool = False, ) -> pd.DataFrame: """Runs an agent.""" raw_responses = _run_agent( @@ -1831,6 +1834,7 @@ def _run_agent_internal( agent=agent, prompt_dataset=prompt_dataset, user_simulator_config=user_simulator_config, + allow_cross_region_model=allow_cross_region_model, ) processed_intermediate_events = [] processed_responses = [] @@ -1872,6 +1876,7 @@ def _run_agent( agent: Optional[LlmAgent], prompt_dataset: pd.DataFrame, user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None, + allow_cross_region_model: bool = False, ) -> list[ Union[ list[dict[str, Any]], @@ -1880,28 +1885,60 @@ def _run_agent( ] ]: """Internal helper to run inference using Gemini model with concurrency.""" - if agent_engine: - return _execute_inference_concurrently( - api_client=api_client, - agent_engine=agent_engine, - prompt_dataset=prompt_dataset, - progress_desc="Agent Run", - gemini_config=None, - user_simulator_config=None, - inference_fn=_execute_agent_run_with_retry, - ) - elif agent: - return _execute_inference_concurrently( - api_client=api_client, - agent=agent, - prompt_dataset=prompt_dataset, - progress_desc="Local Agent Run", - gemini_config=None, - user_simulator_config=user_simulator_config, - inference_fn=_execute_local_agent_run_with_retry, - ) - else: - raise ValueError("Neither agent_engine nor agent is provided.") + original_location = os.environ.get("GOOGLE_CLOUD_LOCATION") + location_overridden = False + + if user_simulator_config and user_simulator_config.model_name: + model_name = user_simulator_config.model_name + if model_name.startswith("gemini-3") and "/" not in model_name: + current_location = original_location or api_client.location or "us-central1" + if current_location != "global" and not allow_cross_region_model: + raise ValueError( + f"The model '{model_name}' is currently only available in the" + " 'global' region. Because this request originated in" + f" '{current_location}', you must explicitly set " + "allow_cross_region_model=True to allow your data to be routed outside" + " of your request's region." + ) + + logger.warning( + "Model %s is only available in the global region. Routing to global.", + model_name, + ) + user_simulator_config.model_name = f"projects/{api_client.project}/locations/global/publishers/google/models/{model_name}" + if original_location != "global": + os.environ["GOOGLE_CLOUD_LOCATION"] = "global" + location_overridden = True + + try: + if agent_engine: + return _execute_inference_concurrently( + api_client=api_client, + agent_engine=agent_engine, + prompt_dataset=prompt_dataset, + progress_desc="Agent Run", + gemini_config=None, + user_simulator_config=None, + inference_fn=_execute_agent_run_with_retry, + ) + elif agent: + return _execute_inference_concurrently( + api_client=api_client, + agent=agent, + prompt_dataset=prompt_dataset, + progress_desc="Local Agent Run", + gemini_config=None, + user_simulator_config=user_simulator_config, + inference_fn=_execute_local_agent_run_with_retry, + ) + else: + raise ValueError("Neither agent_engine nor agent is provided.") + finally: + if location_overridden: + if original_location is None: + del os.environ["GOOGLE_CLOUD_LOCATION"] + else: + os.environ["GOOGLE_CLOUD_LOCATION"] = original_location def _execute_agent_run_with_retry( diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index ece5486c35..ce516c8cf6 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -1957,6 +1957,7 @@ def run_inference( - dest: The destination path for storage of the inference results. - prompt_template: The template string to use for constructing prompts. - generate_content_config: The config for the Gemini generate content call. + - allow_cross_region_model: Opt-in flag to authorize cross-region routing for LLM models. Returns: The evaluation dataset. @@ -1992,6 +1993,7 @@ def run_inference( location=location, config=config.generate_content_config, user_simulator_config=getattr(config, "user_simulator_config", None), + allow_cross_region_model=getattr(config, "allow_cross_region_model", False), ) def evaluate( @@ -2437,6 +2439,8 @@ def create_evaluation_run( If `agent_info` is provided without `inference_configs`, this config is used to automatically construct the inference configuration. If not specified, or if `max_turn` is not set, `max_turn` defaults to 5. + The `model_name` inside this config can be either a full model path or a + short model name, e.g. `gemini-3-preview-flash`. inference_configs: The candidate to inference config map for the evaluation run. The key is the candidate name, and the value is the inference config. If provided, `agent_info` must be None. If omitted and `agent_info` is provided, @@ -3928,6 +3932,8 @@ async def create_evaluation_run( If `agent_info` is provided without `inference_configs`, this config is used to automatically construct the inference configuration. If not specified, or if `max_turn` is not set, `max_turn` defaults to 5. + The `model_name` inside this config can be either a full model path or a + short model name, e.g. `gemini-3-preview-flash`. inference_configs: The candidate to inference config map for the evaluation run. The key is the candidate name, and the value is the inference config. If provided, `agent_info` must be None. If omitted and `agent_info` is provided, diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index fdd1262d4c..535fc9cb26 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -16011,6 +16011,10 @@ class EvalRunInferenceConfig(_common.BaseModel): description="""Configuration for user simulation in multi-turn agent scraping. If provided, and the dataset contains conversation plans, user simulation will be triggered.""", ) + allow_cross_region_model: Optional[bool] = Field( + default=None, + description="""Opt-in flag to authorize cross-region routing for LLM models.""", + ) class EvalRunInferenceConfigDict(TypedDict, total=False): @@ -16029,6 +16033,9 @@ class EvalRunInferenceConfigDict(TypedDict, total=False): """Configuration for user simulation in multi-turn agent scraping. If provided, and the dataset contains conversation plans, user simulation will be triggered.""" + allow_cross_region_model: Optional[bool] + """Opt-in flag to authorize cross-region routing for LLM models.""" + EvalRunInferenceConfigOrDict = Union[EvalRunInferenceConfig, EvalRunInferenceConfigDict]