diff --git a/nemo_gym/openai_utils.py b/nemo_gym/openai_utils.py
index baae61476..4d9e31c40 100644
--- a/nemo_gym/openai_utils.py
+++ b/nemo_gym/openai_utils.py
@@ -463,7 +463,7 @@ async def _request(self, **request_kwargs: Dict) -> ClientResponse:
                 return response

         # We've exited the loop
-        response.raise_for_status()
+        await self._raise_for_status(response, request_kwargs)

     async def _raise_for_status(self, response: ClientResponse, request_kwargs: Dict[str, Any]) -> None:
         if not response.ok:
diff --git a/resources_servers/math_with_judge/configs/dapo17k.yaml b/resources_servers/math_with_judge/configs/dapo17k.yaml
index 867f68d24..f13fdec7f 100644
--- a/resources_servers/math_with_judge/configs/dapo17k.yaml
+++ b/resources_servers/math_with_judge/configs/dapo17k.yaml
@@ -39,3 +39,4 @@ math_with_judge_simple_agent:
       version: 0.0.1
       artifact_fpath: aime24.jsonl
       license: Apache 2.0
+      num_repeats: 32
diff --git a/resources_servers/math_with_judge/data/aime24_validation_metrics.json b/resources_servers/math_with_judge/data/aime24_validation_metrics.json
index 8e0a27d8f..d9429be1d 100644
--- a/resources_servers/math_with_judge/data/aime24_validation_metrics.json
+++ b/resources_servers/math_with_judge/data/aime24_validation_metrics.json
@@ -2,14 +2,14 @@
     "name": "validation",
     "type": "validation",
     "jsonl_fpath": "resources_servers/math_with_judge/data/aime24_validation.jsonl",
-    "num_repeats": 1,
+    "num_repeats": 32,
     "gitlab_identifier": {
         "dataset_name": "aime24",
         "version": "0.0.1",
         "artifact_fpath": "aime24.jsonl"
     },
     "license": "Apache 2.0",
-    "Number of examples": 30,
+    "Number of examples": 960,
     "Number of tools": {
         "Total # non-null values": 0,
         "Average": 0.0,
@@ -19,15 +19,15 @@
         "Standard deviation": 0.0
     },
     "Json-dumped number of words (proxy for token count)": {
-        "Total # non-null values": 30,
+        "Total # non-null values": 960,
         "Average": 80.47,
         "Min": 42.0,
         "Max": 149.0,
         "Median": 81.5,
-        "Standard deviation": 25.11
+        "Standard deviation": 24.7
     },
     "Number of turns": {
-        "Total # non-null values": 30,
+        "Total # non-null values": 960,
         "Average": 1.0,
         "Min": 1.0,
         "Max": 1.0,
@@ -44,10 +44,10 @@
     },
     "question": {
         "unique_count": 30,
-        "total_count": 30
+        "total_count": 960
     },
     "expected_answer": {
         "unique_count": 29,
-        "total_count": 30
+        "total_count": 960
     }
 }
\ No newline at end of file
diff --git a/resources_servers/math_with_judge/data/dapo17k_train_metrics.json b/resources_servers/math_with_judge/data/dapo17k_train_metrics.json
index bdd415cf0..f604f8034 100644
--- a/resources_servers/math_with_judge/data/dapo17k_train_metrics.json
+++ b/resources_servers/math_with_judge/data/dapo17k_train_metrics.json
@@ -23,7 +23,7 @@
         "Average": 85.63,
         "Min": 45.0,
         "Max": 322.0,
-        "Median": 80.41,
+        "Median": 80.33,
         "Standard deviation": 26.94
     },
     "Number of turns": {
diff --git a/responses_api_agents/simple_agent/app.py b/responses_api_agents/simple_agent/app.py
index f3db4393c..4fa6bb6b7 100644
--- a/responses_api_agents/simple_agent/app.py
+++ b/responses_api_agents/simple_agent/app.py
@@ -101,6 +101,9 @@ async def responses(
         output = model_response.output
         new_outputs.extend(output)

+        if model_response.incomplete_details and model_response.incomplete_details.reason == "max_output_tokens":
+            break
+
         all_fn_calls: List[NeMoGymResponseFunctionToolCall] = [o for o in output if o.type == "function_call"]
         all_output_messages: List[NeMoGymResponseOutputMessage] = [
             o for o in output if o.type == "message" and o.role == "assistant"
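Reviewer note: the `simple_agent` hunk above and the `vllm_model` hunks below are two halves of one change. The model server now reports a generation that stopped at the token limit through `incomplete_details`, and the agent loop exits early rather than requesting another turn. Below is a minimal, self-contained sketch of that handshake; everything except `incomplete_details`, `reason`, and `"max_output_tokens"` is invented for illustration.

```python
from dataclasses import dataclass
from typing import Callable, List, Optional


@dataclass
class IncompleteDetails:
    reason: str


@dataclass
class StubResponse:
    output: List[str]
    incomplete_details: Optional[IncompleteDetails] = None


def collect_outputs(call_model: Callable[[], StubResponse], max_turns: int = 8) -> List[str]:
    """Toy agent loop: accumulate outputs until the model finishes or truncates."""
    new_outputs: List[str] = []
    for _ in range(max_turns):
        model_response = call_model()
        new_outputs.extend(model_response.output)
        # Same guard as the hunk above: a response clipped at max_output_tokens
        # cannot contain a complete answer or tool call, so another turn would
        # only burn tokens.
        if model_response.incomplete_details and model_response.incomplete_details.reason == "max_output_tokens":
            break
    return new_outputs
```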
diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index b9a61f996..bb7859651 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
+from copy import deepcopy
 from time import time
-from typing import ClassVar, Dict, List, Optional, Tuple, Union
+from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union
 from uuid import uuid4

-from aiohttp.client_exceptions import ClientResponseError
 from fastapi import Request
 from pydantic import BaseModel, Field

@@ -34,7 +34,6 @@
     NeMoGymChatCompletionAssistantMessageParam,
     NeMoGymChatCompletionCreateParamsNonStreaming,
     NeMoGymChatCompletionDeveloperMessageParam,
-    NeMoGymChatCompletionMessage,
     NeMoGymChatCompletionMessageParam,
     NeMoGymChatCompletionMessageToolCallFunctionParam,
     NeMoGymChatCompletionMessageToolCallParam,
@@ -66,6 +65,8 @@ class VLLMModelConfig(BaseResponsesAPIModelConfig):
     uses_reasoning_parser: bool
     replace_developer_role_with_system: bool = False

+    chat_template_kwargs: Optional[Dict[str, Any]] = None
+
     def model_post_init(self, context):
         if isinstance(self.base_url, str):
             self.base_url = [self.base_url]
@@ -132,6 +133,7 @@ async def responses(
             metadata=body.metadata,
             instructions=body.instructions,
             user=body.user,
+            incomplete_details={"reason": "max_output_tokens"} if choice.finish_reason == "length" else None,
         )

     async def chat_completions(
@@ -144,6 +146,8 @@ async def chat_completions(

         body_dict = body.model_dump(exclude_unset=True)
         body_dict["model"] = self.config.model
+        if self.config.chat_template_kwargs:
+            body_dict["chat_template_kwargs"] = deepcopy(self.config.chat_template_kwargs)

         session_id = request.session[SESSION_ID_KEY]
         if session_id not in self._session_id_to_client:
@@ -198,45 +202,7 @@ async def chat_completions(
         else:
             raise NotImplementedError

-        try:
-            chat_completion_dict = await client.create_chat_completion(**create_params)
-        except ClientResponseError as e:
-            """
-            Example messages for out of context length:
-
-            1. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L914
-            ```json
-            {"object":"error","message":"This model's maximum context length is 32768 tokens. However, you requested 32818 tokens in the messages, Please reduce the length of the messages. None","type":"BadRequestError","param":null,"code":400}
-            ```
-            2. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L940
-            3. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L948
-            4. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/sampling_params.py#L463
-            """
-            result_content_str = e.response_content.decode()
-
-            is_out_of_context_length = e.status == 400 and (
-                "context length" in result_content_str or "max_tokens" in result_content_str
-            )
-            if is_out_of_context_length:
-                return NeMoGymChatCompletion(
-                    id="chtcmpl-123",
-                    object="chat.completion",
-                    created=int(time()),
-                    model=self.config.model,
-                    choices=[
-                        NeMoGymChoice(
-                            index=0,
-                            finish_reason="stop",
-                            message=NeMoGymChatCompletionMessage(
-                                role="assistant",
-                                content=None,
-                                tool_calls=None,
-                            ),
-                        )
-                    ],
-                )
-            else:
-                raise e
+        chat_completion_dict = await client.create_chat_completion(**create_params)

         choice_dict = chat_completion_dict["choices"][0]
         if self.config.uses_reasoning_parser:
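Reviewer note: the new `chat_template_kwargs` config field is forwarded verbatim into every outgoing chat-completions body, which vLLM in turn passes to the tokenizer's chat template. A hedged sketch of the forwarding logic follows; the `enable_thinking` key is purely an example value, not something this diff prescribes.

```python
from copy import deepcopy

# Hypothetical config value for illustration only; any kwargs the model's
# chat template accepts could be configured here.
chat_template_kwargs = {"enable_thinking": False}

body_dict = {
    "model": "example-model",
    "messages": [{"role": "user", "content": "Solve 1 + 1."}],
}

# Mirrors the hunk above: deepcopy keeps the shared config dict safe if a
# downstream handler mutates the per-request body in place.
if chat_template_kwargs:
    body_dict["chat_template_kwargs"] = deepcopy(chat_template_kwargs)

print(body_dict["chat_template_kwargs"])  # {'enable_thinking': False}
```

With the out-of-context-length fallback removed here, a 400 from vLLM now propagates to the caller, presumably to be handled centrally by the `_raise_for_status` path touched in the first hunk of this diff.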