diff --git a/nemo_gym/cli.py b/nemo_gym/cli.py index a64f0c11..0ad353bb 100644 --- a/nemo_gym/cli.py +++ b/nemo_gym/cli.py @@ -25,10 +25,10 @@ from os.path import exists from pathlib import Path from signal import SIGINT -from subprocess import Popen +from subprocess import Popen, PIPE from threading import Thread from time import sleep -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import psutil import rich @@ -59,21 +59,62 @@ def _setup_env_command(dir_path: Path, global_config_dict: DictConfig) -> str: # pragma: no cover - install_cmd = "uv pip install -r requirements.txt" head_server_deps = global_config_dict[HEAD_SERVER_DEPS_KEY_NAME] - install_cmd += " " + " ".join(head_server_deps) - return f"""cd {dir_path} \\ - && uv venv --allow-existing --python {global_config_dict[PYTHON_VERSION_KEY_NAME]} \\ + uv_venv_cmd = f"uv venv --seed --allow-existing --python {global_config_dict[PYTHON_VERSION_KEY_NAME]} .venv" + + pyproject_toml = False + requirements_txt = False + try: + with open(f"{dir_path / 'pyproject.toml'}", "r") as _f: + pyproject_toml = True + except OSError: + pass + try: + with open(f"{dir_path / 'requirements.txt'}", "r") as _f: + requirements_txt = True + except OSError: + pass + + if pyproject_toml: + install_cmd = f"""uv pip install '-e .' {" ".join(head_server_deps)}""" + elif requirements_txt: + install_cmd = f"""uv pip install -r requirements.txt {" ".join(head_server_deps)}""" + else: + raise RuntimeError(f"Missing pyproject.toml or requirements.txt for uv venv setup in server dir: {dir_path}") + + cmd = f"""cd {dir_path} \\ + && {uv_venv_cmd} \\ && source .venv/bin/activate \\ && {install_cmd} \\ - """ + """ + return cmd -def _run_command(command: str, working_directory: Path) -> Popen: # pragma: no cover + +def _run_command( + command: str, working_dir_path: Path, top_level_path: Optional[str] = None +) -> Popen: # pragma: no cover + work_dir = f"{working_dir_path.absolute()}" custom_env = environ.copy() - custom_env["PYTHONPATH"] = f"{working_directory.absolute()}:{custom_env.get('PYTHONPATH', '')}" - return Popen(command, executable="/bin/bash", shell=True, env=custom_env) + py_path = custom_env.get("PYTHONPATH", None) + if py_path is not None: + custom_env["PYTHONPATH"] = f"{work_dir}:{py_path}" + else: + custom_env["PYTHONPATH"] = work_dir + redirect_stdout = None + redirect_stderr = None + if top_level_path: + redirect_stdout = PIPE + redirect_stderr = PIPE + return Popen( + command, + executable="/bin/bash", + shell=True, + env=custom_env, + stdout=redirect_stdout, + stderr=redirect_stderr, + ) class RunConfig(BaseNeMoGymCLIConfig): @@ -193,7 +234,7 @@ def start(self, global_config_dict_parser_config: GlobalConfigDictParserConfig) {NEMO_GYM_CONFIG_PATH_ENV_VAR_NAME}={shlex.quote(top_level_path)} \\ python {str(entrypoint_fpath)}""" - process = _run_command(command, dir_path) + process = _run_command(command, dir_path, top_level_path) self._processes[top_level_path] = process host = server_config_dict.get("host") @@ -255,6 +296,18 @@ def poll(self) -> None: for process_name, process in self._processes.items(): if process.poll() is not None: + proc_out, proc_err = process.communicate() + print(f"Process `{process_name}` finished unexpectedly!") + print(f"Process `{process_name}` stdout:", flush=True) + if isinstance(proc_out, bytes): + print(proc_out.decode("utf-8"), flush=True) + else: + print(proc_out, flush=True) + print(f"Process `{process_name}` stderr:", flush=True) + if isinstance(proc_err, bytes): + print(proc_err.decode("utf-8"), flush=True) + else: + print(proc_err, flush=True) raise RuntimeError(f"Process `{process_name}` finished unexpectedly!") def wait_for_spinup(self) -> None: @@ -265,11 +318,18 @@ def wait_for_spinup(self) -> None: self.poll() statuses = self.check_http_server_statuses() - num_spun_up = statuses.count("success") + num_spun_up = 0 + waiting = [] + for name, status in statuses: + if status == "success": + num_spun_up += 1 + else: + waiting.append(name) if len(statuses) != num_spun_up: print( f"""{num_spun_up} / {len(statuses)} servers ready ({statuses.count("timeout")} timed out, {statuses.count("connection_error")} connection errored, {statuses.count("unknown_error")} had unknown errors). -Waiting for servers to spin up. Sleeping {sleep_interval}s...""" +Waiting for servers to spin up: {waiting} +Sleeping {sleep_interval}s...""" ) else: print(f"All {num_spun_up} / {len(statuses)} servers ready! Polling every 60s") @@ -311,7 +371,7 @@ async def sleep(): finally: self.shutdown() - def check_http_server_statuses(self) -> List[ServerStatus]: + def check_http_server_statuses(self) -> List[Tuple[str, ServerStatus]]: print( "Checking for HTTP server statuses (you should see some HTTP requests to `/` that may 404. This is expected.)" ) @@ -319,7 +379,7 @@ def check_http_server_statuses(self) -> List[ServerStatus]: for server_instance_display_config in self._server_instance_display_configs: name = server_instance_display_config.config_path status = self._server_client.poll_for_status(name) - statuses.append(status) + statuses.append((name, status)) return statuses diff --git a/nemo_gym/config_types.py b/nemo_gym/config_types.py index b340081c..355c0189 100644 --- a/nemo_gym/config_types.py +++ b/nemo_gym/config_types.py @@ -312,8 +312,8 @@ class DatasetConfig(BaseModel): Literal["MIT"], Literal["Creative Commons Attribution 4.0 International"], Literal["Creative Commons Attribution-ShareAlike 4.0 International"], + Literal["NVIDIA Internal Use Only, Do Not Distribute"], Literal["TBD"], - Literal["MIT"], ] ] = None @@ -340,6 +340,7 @@ class Domain(str, Enum): LONG_CONTEXT = "long_context" SAFETY = "safety" GAMES = "games" + TRANSLATION = "translation" E2E = "e2e" OTHER = "other" diff --git a/nemo_gym/global_config.py b/nemo_gym/global_config.py index d9cc3687..d69b3025 100644 --- a/nemo_gym/global_config.py +++ b/nemo_gym/global_config.py @@ -261,7 +261,8 @@ def parse(self, parse_config: Optional[GlobalConfigDictParserConfig] = None) -> # Constrain sensitive package versions global_config_dict[HEAD_SERVER_DEPS_KEY_NAME] = [ # The ray version is very sensitive. The children ray versions must exactly match those of the parent ray. - f"ray=={ray_version}", + # The ray extra [default] should also exactly match the extra in the top-level Gym pyproject.toml. + f"ray[default]=={ray_version}", # OpenAI version is also sensitive since it changes so often and may introduce subtle incompatibilities. f"openai=={openai_version}", ] diff --git a/responses_api_models/vllm_model/pyproject.toml b/responses_api_models/vllm_model/pyproject.toml new file mode 100644 index 00000000..3459458b --- /dev/null +++ b/responses_api_models/vllm_model/pyproject.toml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[project] +name = "vllm-model" +version = "0.2.0rc0" +requires-python = ">=3.12" +dependencies = [ + "nemo-gym[dev]", + "vllm>=0.11.2", +] + +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools>=61", "setuptools-scm"] + +[tool.setuptools.packages.find] +where = [".."] +include = ["vllm_model"] + +[tool.uv.sources] +nemo-gym = { path = "../..", editable = true } diff --git a/responses_api_models/vllm_model/requirements.txt b/responses_api_models/vllm_model/requirements.txt deleted file mode 100644 index 00ed8321..00000000 --- a/responses_api_models/vllm_model/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --e nemo-gym[dev] @ ../../ diff --git a/tests/unit_tests/test_global_config.py b/tests/unit_tests/test_global_config.py index 67123fb0..2f9b8329 100644 --- a/tests/unit_tests/test_global_config.py +++ b/tests/unit_tests/test_global_config.py @@ -42,7 +42,7 @@ def _mock_versions_for_testing(self, monkeypatch: MonkeyPatch) -> Dict[str, str] monkeypatch.setattr(nemo_gym.global_config, "python_version", python_version_mock) return { - "head_server_deps": ["ray==test ray version", "openai==test openai version"], + "head_server_deps": ["ray[default]==test ray version", "openai==test openai version"], "python_version": "test python version", }