Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
316d43e
Miscellaneous infra.
pjin-nvidia Nov 13, 2025
4ecd8d3
Remove DEBUG. Comment.
pjin-nvidia Nov 15, 2025
8103dbf
Comment about ray package extra.
pjin-nvidia Nov 15, 2025
dc493d5
The.
pjin-nvidia Nov 15, 2025
f9e5d8f
Merge remote-tracking branch 'origin/main' into pjin/misc-infra
pjin-nvidia Nov 15, 2025
9502d82
Fix test (?).
pjin-nvidia Nov 15, 2025
0475d5e
Initial support for server pyproject.toml (WIP).
pjin-nvidia Nov 15, 2025
d86756b
Fix pyproject.toml check.
pjin-nvidia Nov 15, 2025
79028a6
Working directory Path.
pjin-nvidia Nov 15, 2025
7e62b1d
Install a server venv from pyproject.toml if available.
pjin-nvidia Nov 15, 2025
36efb94
Deprecated vllm_model requirements.txt.
pjin-nvidia Nov 15, 2025
8d49b95
Consistently use dashes in package names.
pjin-nvidia Nov 15, 2025
6fb0a95
Lint.
pjin-nvidia Nov 15, 2025
7231efa
Cleanup.
pjin-nvidia Nov 15, 2025
8fc0d9d
VLLM server spinup.
pjin-nvidia Nov 15, 2025
8975e98
VLLM server host and port.
pjin-nvidia Nov 15, 2025
51ba6fc
Allocate the free port for VLLM in the model server process.
pjin-nvidia Nov 16, 2025
aa97796
Type.
pjin-nvidia Nov 16, 2025
6ec9325
Fix for pyproject.toml (this works lol).
pjin-nvidia Nov 16, 2025
33ec3f9
VLLM server "routing" (just re-using the existing multiple clients).
pjin-nvidia Nov 16, 2025
77cda85
Better order.
pjin-nvidia Nov 16, 2025
7201c8f
Comment.
pjin-nvidia Nov 16, 2025
834d9b9
Default to "mp" backend.
pjin-nvidia Nov 16, 2025
5ee8b57
Cleanup.
pjin-nvidia Nov 16, 2025
10b5295
Cleanup.
pjin-nvidia Nov 16, 2025
e4c5573
Non-async VLLM server heartbeat to avoid early asyncio event loop.
pjin-nvidia Nov 16, 2025
0a8da20
With pyproject.toml, no pre-install command needed.
pjin-nvidia Nov 16, 2025
ad0e2fc
Improved server venv pyproject install that does not use editable.
pjin-nvidia Nov 17, 2025
0436b47
Packaging and setup.
pjin-nvidia Nov 17, 2025
854609f
Revert VLLMModel changes (moving to PR #318).
pjin-nvidia Nov 17, 2025
dc6ffef
One line uv pip install.
pjin-nvidia Nov 18, 2025
e8afd2d
Print the names of servers yet to have finished spinning up.
pjin-nvidia Nov 20, 2025
0142784
Formatting.
pjin-nvidia Nov 20, 2025
a0c0d19
Merge remote-tracking branch 'origin/main' into pjin/misc-infra
pjin-nvidia Nov 26, 2025
0a94c2d
Merge remote-tracking branch 'origin/main' into pjin/misc-infra
pjin-nvidia Dec 1, 2025
66b788d
Revert to just cd into working dir.
pjin-nvidia Dec 2, 2025
a78f226
Deduplicate.
pjin-nvidia Dec 2, 2025
fdb54fe
Also add explicit check for requirements.txt.
pjin-nvidia Dec 2, 2025
3fb2911
Revert format.
pjin-nvidia Dec 2, 2025
b99a5c4
Merge remote-tracking branch 'origin/main' into pjin/misc-infra
pjin-nvidia Dec 9, 2025
fd98595
Sync vllm_model pyproject.toml.
pjin-nvidia Dec 9, 2025
987cf5c
Minimum version of vllm >= 0.11.2.
pjin-nvidia Dec 9, 2025
d11a66a
Merge branch 'main' of https://github.com/NVIDIA-NeMo/Gym into pjin/m…
bxyu-nvidia Dec 10, 2025
912f23a
Log with stdout/stderr redirection.
pjin-nvidia Dec 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 75 additions & 15 deletions nemo_gym/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
from os.path import exists
from pathlib import Path
from signal import SIGINT
from subprocess import Popen
from subprocess import Popen, PIPE
from threading import Thread
from time import sleep
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

import psutil
import rich
Expand Down Expand Up @@ -59,21 +59,62 @@


def _setup_env_command(dir_path: Path, global_config_dict: DictConfig) -> str:  # pragma: no cover
    """Build the one-line bash command that provisions a server's virtualenv.

    The returned command cd's into ``dir_path``, creates (or reuses) ``.venv``
    with the configured Python version, activates it, and installs the server's
    dependencies together with the pinned head-server deps (whose versions must
    match the head server exactly, e.g. ray/openai).

    Args:
        dir_path: Server directory; must contain a ``pyproject.toml``
            (preferred) or a ``requirements.txt``.
        global_config_dict: Global config supplying the Python version and the
            pinned head-server dependency specs.

    Returns:
        A single ``&&``-chained bash command string suitable for ``Popen``.

    Raises:
        RuntimeError: If neither ``pyproject.toml`` nor ``requirements.txt``
            exists in ``dir_path``.
    """
    head_server_deps = global_config_dict[HEAD_SERVER_DEPS_KEY_NAME]
    head_server_deps_str = " ".join(head_server_deps)

    # --seed pre-installs pip/setuptools in the venv; --allow-existing makes
    # repeated spinups of the same server directory idempotent.
    uv_venv_cmd = f"uv venv --seed --allow-existing --python {global_config_dict[PYTHON_VERSION_KEY_NAME]} .venv"

    # Probe for the two supported dependency manifests without opening them.
    # (Paths that exist but are unreadable/not regular files count as absent,
    # matching the previous open()-based probe.)
    pyproject_toml = (dir_path / "pyproject.toml").is_file()
    requirements_txt = (dir_path / "requirements.txt").is_file()

    if pyproject_toml:
        # pyproject.toml is preferred: a single install resolves the project
        # and the pinned head-server deps together.
        install_cmd = f"""uv pip install '-e .' {head_server_deps_str}"""
    elif requirements_txt:
        install_cmd = f"""uv pip install -r requirements.txt {head_server_deps_str}"""
    else:
        raise RuntimeError(f"Missing pyproject.toml or requirements.txt for uv venv setup in server dir: {dir_path}")

    cmd = f"""cd {dir_path} \\
    && {uv_venv_cmd} \\
    && source .venv/bin/activate \\
    && {install_cmd} \\
    """

    return cmd

def _run_command(command: str, working_directory: Path) -> Popen: # pragma: no cover

def _run_command(
command: str, working_dir_path: Path, top_level_path: Optional[str] = None
) -> Popen: # pragma: no cover
work_dir = f"{working_dir_path.absolute()}"
custom_env = environ.copy()
custom_env["PYTHONPATH"] = f"{working_directory.absolute()}:{custom_env.get('PYTHONPATH', '')}"
return Popen(command, executable="/bin/bash", shell=True, env=custom_env)
py_path = custom_env.get("PYTHONPATH", None)
if py_path is not None:
custom_env["PYTHONPATH"] = f"{work_dir}:{py_path}"
else:
custom_env["PYTHONPATH"] = work_dir
redirect_stdout = None
redirect_stderr = None
if top_level_path:
redirect_stdout = PIPE
redirect_stderr = PIPE
return Popen(
command,
executable="/bin/bash",
shell=True,
env=custom_env,
stdout=redirect_stdout,
stderr=redirect_stderr,
)


class RunConfig(BaseNeMoGymCLIConfig):
Expand Down Expand Up @@ -193,7 +234,7 @@ def start(self, global_config_dict_parser_config: GlobalConfigDictParserConfig)
{NEMO_GYM_CONFIG_PATH_ENV_VAR_NAME}={shlex.quote(top_level_path)} \\
python {str(entrypoint_fpath)}"""

process = _run_command(command, dir_path)
process = _run_command(command, dir_path, top_level_path)
self._processes[top_level_path] = process

host = server_config_dict.get("host")
Expand Down Expand Up @@ -255,6 +296,18 @@ def poll(self) -> None:

for process_name, process in self._processes.items():
if process.poll() is not None:
proc_out, proc_err = process.communicate()
print(f"Process `{process_name}` finished unexpectedly!")
print(f"Process `{process_name}` stdout:", flush=True)
if isinstance(proc_out, bytes):
print(proc_out.decode("utf-8"), flush=True)
else:
print(proc_out, flush=True)
print(f"Process `{process_name}` stderr:", flush=True)
if isinstance(proc_err, bytes):
print(proc_err.decode("utf-8"), flush=True)
else:
print(proc_err, flush=True)
raise RuntimeError(f"Process `{process_name}` finished unexpectedly!")

def wait_for_spinup(self) -> None:
Expand All @@ -265,11 +318,18 @@ def wait_for_spinup(self) -> None:
self.poll()
statuses = self.check_http_server_statuses()

num_spun_up = statuses.count("success")
num_spun_up = 0
waiting = []
for name, status in statuses:
if status == "success":
num_spun_up += 1
else:
waiting.append(name)
if len(statuses) != num_spun_up:
print(
f"""{num_spun_up} / {len(statuses)} servers ready ({statuses.count("timeout")} timed out, {statuses.count("connection_error")} connection errored, {statuses.count("unknown_error")} had unknown errors).
Waiting for servers to spin up. Sleeping {sleep_interval}s..."""
Waiting for servers to spin up: {waiting}
Sleeping {sleep_interval}s..."""
)
else:
print(f"All {num_spun_up} / {len(statuses)} servers ready! Polling every 60s")
Expand Down Expand Up @@ -311,15 +371,15 @@ async def sleep():
finally:
self.shutdown()

def check_http_server_statuses(self) -> List[ServerStatus]:
def check_http_server_statuses(self) -> List[Tuple[str, ServerStatus]]:
    """Poll every configured server once and return (config_path, status) pairs."""
    print(
        "Checking for HTTP server statuses (you should see some HTTP requests to `/` that may 404. This is expected.)"
    )
    # Pair each server's config path with its polled status so callers can
    # report exactly which servers are still spinning up.
    return [
        (display_config.config_path, self._server_client.poll_for_status(display_config.config_path))
        for display_config in self._server_instance_display_configs
    ]

Expand Down
3 changes: 2 additions & 1 deletion nemo_gym/config_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,8 @@ class DatasetConfig(BaseModel):
Literal["MIT"],
Literal["Creative Commons Attribution 4.0 International"],
Literal["Creative Commons Attribution-ShareAlike 4.0 International"],
Literal["NVIDIA Internal Use Only, Do Not Distribute"],
Literal["TBD"],
Literal["MIT"],
]
] = None

Expand All @@ -340,6 +340,7 @@ class Domain(str, Enum):
LONG_CONTEXT = "long_context"
SAFETY = "safety"
GAMES = "games"
TRANSLATION = "translation"
E2E = "e2e"
OTHER = "other"

Expand Down
3 changes: 2 additions & 1 deletion nemo_gym/global_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ def parse(self, parse_config: Optional[GlobalConfigDictParserConfig] = None) ->
# Constrain sensitive package versions
global_config_dict[HEAD_SERVER_DEPS_KEY_NAME] = [
# The ray version is very sensitive. The children ray versions must exactly match those of the parent ray.
f"ray=={ray_version}",
# The ray extra [default] should also exactly match the extra in the top-level Gym pyproject.toml.
f"ray[default]=={ray_version}",
# OpenAI version is also sensitive since it changes so often and may introduce subtle incompatibilities.
f"openai=={openai_version}",
]
Expand Down
34 changes: 34 additions & 0 deletions responses_api_models/vllm_model/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Packaging for the vLLM model server. The Gym CLI builds a per-server venv
# from this file (preferred over requirements.txt) via `uv pip install`.
[project]
name = "vllm-model"
version = "0.2.0rc0"
requires-python = ">=3.12"
dependencies = [
    "nemo-gym[dev]",
    # Minimum supported vllm version for this server.
    "vllm>=0.11.2",
]

[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=61", "setuptools-scm"]

# The package sources live one directory up (responses_api_models/), so point
# package discovery there and include only the vllm_model package.
[tool.setuptools.packages.find]
where = [".."]
include = ["vllm_model"]

# Resolve the nemo-gym dependency from the repository root as an editable
# install when building with uv.
[tool.uv.sources]
nemo-gym = { path = "../..", editable = true }
1 change: 0 additions & 1 deletion responses_api_models/vllm_model/requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion tests/unit_tests/test_global_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _mock_versions_for_testing(self, monkeypatch: MonkeyPatch) -> Dict[str, str]
monkeypatch.setattr(nemo_gym.global_config, "python_version", python_version_mock)

return {
"head_server_deps": ["ray==test ray version", "openai==test openai version"],
"head_server_deps": ["ray[default]==test ray version", "openai==test openai version"],
"python_version": "test python version",
}

Expand Down
Loading