From ed952fe445a7993e4b6e73addd94f3c9fa4edbca Mon Sep 17 00:00:00 2001
From: Dariusz Trawinski
Date: Tue, 13 Jan 2026 15:23:04 +0100
Subject: [PATCH 1/5] updated bfcl instructions

---
 demos/continuous_batching/accuracy/README.md | 12 ++--
 .../accuracy/gorilla.patch                   | 64 +++++++++++--------
 2 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/demos/continuous_batching/accuracy/README.md b/demos/continuous_batching/accuracy/README.md
index 57dc6c546e..db76509fc6 100644
--- a/demos/continuous_batching/accuracy/README.md
+++ b/demos/continuous_batching/accuracy/README.md
@@ -112,21 +112,23 @@ Use [Berkeley function call leaderboard ](https://github.com/ShishirPatil/gorill
 ```text
 git clone https://github.com/ShishirPatil/gorilla
 cd gorilla/berkeley-function-call-leaderboard
-git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64
+git checkout 9b8a5202544f49a846aced185a340361231ef3e1
 curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/continuous_batching/accuracy/gorilla.patch | git apply -v
-pip install -e .
+pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
 ```
 The commands below assume the model is deployed with the name `ovms-model`. It must match the name set in `bfcl_eval/constants/model_config.py`.
 ```text
 export OPENAI_BASE_URL=http://localhost:8000/v3
-bfcl generate --model ovms-model --test-category simple,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir
+export CHAT_TEMPLATE_KWARGS='{"enable_thinking":false, "reasoning_effort":"low"}'
+
+bfcl generate --model ovms-model --test-category simple_python,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir
 bfcl evaluate --model ovms-model --result-dir model_name_dir
 ```
 Alternatively, use the model name `ovms-model-stream` to run the tests with streaming requests. The results should be the same.
 ```text
 export OPENAI_BASE_URL=http://localhost:8000/v3
-bfcl generate --model ovms-model-stream --test-category simple,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir
+bfcl generate --model ovms-model-stream --test-category simple_python,multiple --temperature 0.0 --num-threads 100 -o --result-dir model_name_dir
 bfcl evaluate --model ovms-model-stream --result-dir model_name_dir
 ```
@@ -134,7 +136,7 @@ bfcl evaluate --model ovms-model-stream --result-dir model_name_dir
 The output artifacts will be stored in `result` and `score`. For example:
 ```text
-cat score/openvino-qwen3-8b-int4-FC/BFCL_v3_simple_score.json | head -1
+cat score/openvino-qwen3-8b-int4-FC/BFCL_v3_simple_python_score.json | head -1
 {"accuracy": 0.95, "correct_count": 380, "total_count": 400}
 ```
 Those results can be compared with the reference values from the [Berkeley leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html#leaderboard).
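Before running the full benchmark, it can help to confirm that the deployed endpoint accepts the request shape the patched handler builds: top-level `tool_choice` plus `chat_template_kwargs`, which the OpenAI client merges into the request body via `extra_body`. The snippet below is an illustrative smoke test, not part of the patch; it assumes the server is already running on `localhost:8000` with the model exposed as `ovms-model`, and the `get_weather` tool definition is made up for the example.

```bash
# Hypothetical smoke test; the weather tool and prompt are examples only.
curl -s http://localhost:8000/v3/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "ovms-model",
        "temperature": 0.0,
        "max_completion_tokens": 2048,
        "tool_choice": "auto",
        "chat_template_kwargs": {"enable_thinking": false},
        "messages": [{"role": "user", "content": "What is the weather in Warsaw?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }]
      }' | jq '.choices[0].message.tool_calls'
```

A non-empty `tool_calls` array suggests the tool parser configured at deployment matches the model's chat template; an empty one usually points at a template/parser mismatch worth resolving before spending time on a full BFCL run.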
diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 705f513843..48af0f5eac 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,10 +1,10 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index 73731c0..b6bbf48 100644 +index bb625d2..7204adb 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -@@ -2060,6 +2060,30 @@ third_party_inference_model_map = { +@@ -2153,6 +2153,30 @@ third_party_inference_model_map = { is_fc_model=True, - underscore_to_dot=False, + underscore_to_dot=True, ), + "ovms-model": ModelConfig( + model_name="ovms-model", @@ -34,57 +34,69 @@ index 73731c0..b6bbf48 100644 diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py -index 8665234..c224681 100644 +index 357584f..e45e12c 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py -@@ -23,7 +23,7 @@ class OpenAICompletionsHandler(BaseHandler): - def __init__(self, model_name, temperature) -> None: - super().__init__(model_name, temperature) - self.model_style = ModelStyle.OpenAI_Completions -- self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -+ self.client = OpenAI(base_url=os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"), api_key=os.getenv("OPENAI_API_KEY", "not_used"),timeout=os.getenv("OPENAI_TIMEOUT", 3600)) +@@ -38,10 +38,10 @@ class OpenAICompletionsHandler(BaseHandler): - def decode_ast(self, result, language="Python"): - if "FC" in self.model_name or self.is_fc_model: -@@ -61,6 +61,9 @@ class OpenAICompletionsHandler(BaseHandler): + kwargs = {} + +- if api_key := os.getenv("OPENAI_API_KEY"): ++ if api_key := os.getenv("OPENAI_API_KEY","unused"): + kwargs["api_key"] = api_key + +- if base_url := os.getenv("OPENAI_BASE_URL"): ++ if base_url := os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"): + kwargs["base_url"] = base_url + + if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): +@@ -85,6 +85,9 @@ class OpenAICompletionsHandler(BaseHandler): "messages": message, - "model": self.model_name.replace("-FC", ""), + "model": self.model_name, "temperature": self.temperature, -+ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), -+ "extra_body": {"chat_template_kwargs": {"enable_thinking": bool(os.getenv("ENABLE_THINKING", ""))}}, + "max_completion_tokens": 2048, ++ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, "store": False, } diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -index 9ce4e7d..06ec74e 100644 +index 10f1a08..50890c7 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -@@ -21,8 +21,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): - super().__init__(model_name, temperature) - 
self.model_style = ModelStyle.OpenAI_Completions +@@ -7,6 +7,7 @@ from openai import OpenAI + from overrides import override + from qwen_agent.llm import get_chat_model + import time ++import json + + class QwenAPIHandler(OpenAICompletionsHandler): + """ +@@ -28,8 +29,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): + super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) + self.model_style = ModelStyle.OPENAI_COMPLETIONS self.client = OpenAI( - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - api_key=os.getenv("QWEN_API_KEY"), + base_url=os.getenv("OPENAI_BASE_URL", "https://localhost:8000/v3"), -+ api_key=os.getenv("QWEN_API_KEY", "not_used"), ++ api_key=os.getenv("QWEN_API_KEY","unused"), ) #### FC methods #### -@@ -38,9 +38,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): +@@ -45,9 +46,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): model=self.model_name.replace("-FC", ""), tools=tools, parallel_tool_calls=True, - extra_body={ - "enable_thinking": True - }, -+ extra_body={ "chat_template_kwargs": { -+ "enable_thinking": bool(os.getenv("ENABLE_THINKING", "")) -+ }}, ++ max_completion_tokens=2048, ++ tool_choice=os.getenv("TOOL_CHOICE", "auto"), ++ extra_body={"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, stream=True, stream_options={ "include_usage": True -@@ -338,4 +338,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): +@@ -352,4 +353,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): 'timeout': 1000, 'max_tokens': 16384 } From a11a441c5ab886ca442f75c87e066fd8750c87fb Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 13 Jan 2026 16:01:19 +0100 Subject: [PATCH 2/5] updated scripts to test accuracy --- .../chat_template_devstral.jinja | 107 ++++++++++++++++++ ...est_all_models.sh => export_all_models.sh} | 84 ++------------ tests/accuracy/install_gorilla.sh | 29 +++++ tests/accuracy/test_case_ids_to_generate.json | 4 + tests/accuracy/test_single_model.sh | 22 ++++ tests/accuracy/test_small_models.sh | 86 ++++++++++++++ 6 files changed, 258 insertions(+), 74 deletions(-) create mode 100644 extras/chat_template_examples/chat_template_devstral.jinja rename tests/accuracy/{test_all_models.sh => export_all_models.sh} (77%) mode change 100755 => 100644 create mode 100644 tests/accuracy/install_gorilla.sh create mode 100644 tests/accuracy/test_case_ids_to_generate.json create mode 100644 tests/accuracy/test_single_model.sh create mode 100755 tests/accuracy/test_small_models.sh diff --git a/extras/chat_template_examples/chat_template_devstral.jinja b/extras/chat_template_examples/chat_template_devstral.jinja new file mode 100644 index 0000000000..19c1fa1c3e --- /dev/null +++ b/extras/chat_template_examples/chat_template_devstral.jinja @@ -0,0 +1,107 @@ +{#- Copyright 2026-present the Intel team. All rights reserved. #} +{#- Licensed under the Apache License, Version 2.0 (the "License") #} +{#- Edits made by Unsloth and Intel #} +{%- set default_system_message = 'You are Devstral, a helpful agentic model trained by Mistral AI and using the OpenHands scaffold. You can interact with a computer to solve tasks.\n\n\nYour primary role is to assist users by executing commands, modifying code, and solving technical problems effectively. You should be thorough, methodical, and prioritize quality over speed.\n* If the user asks a question, like \"why is X happening\", don\'t try to fix the problem. Just give an answer to the question.\n\n\n\n* Each action you take is somewhat expensive. 
Wherever possible, combine multiple actions into a single action, e.g. combine multiple bash commands into one, using sed and grep to edit/view multiple files at once.\n* When exploring the codebase, use efficient tools like find, grep, and git commands with appropriate filters to minimize unnecessary operations.\n\n\n\n* When a user provides a file path, do NOT assume it\'s relative to the current working directory. First explore the file system to locate the file before working on it.\n* If asked to edit a file, edit the file directly, rather than creating a new file with a different filename.\n* For global search-and-replace operations, consider using `sed` instead of opening file editors multiple times.\n\n\n\n* Write clean, efficient code with minimal comments. Avoid redundancy in comments: Do not repeat information that can be easily inferred from the code itself.\n* When implementing solutions, focus on making the minimal changes needed to solve the problem.\n* Before implementing any changes, first thoroughly understand the codebase through exploration.\n* If you are adding a lot of code to a function or file, consider splitting the function or file into smaller pieces when appropriate.\n\n\n\n* When configuring git credentials, use \"openhands\" as the user.name and \"openhands@all-hands.dev\" as the user.email by default, unless explicitly instructed otherwise.\n* Exercise caution with git operations. Do NOT make potentially dangerous changes (e.g., pushing to main, deleting repositories) unless explicitly asked to do so.\n* When committing changes, use `git status` to see all modified files, and stage all files necessary for the commit. Use `git commit -a` whenever possible.\n* Do NOT commit files that typically shouldn\'t go into version control (e.g., node_modules/, .env files, build directories, cache files, large binaries) unless explicitly instructed by the user.\n* If unsure about committing certain files, check for the presence of .gitignore files or ask the user for clarification.\n\n\n\n* When creating pull requests, create only ONE per session/issue unless explicitly instructed otherwise.\n* When working with an existing PR, update it with new commits rather than creating additional PRs for the same issue.\n* When updating a PR, preserve the original PR title and purpose, updating description only when necessary.\n\n\n\n1. EXPLORATION: Thoroughly explore relevant files and understand the context before proposing solutions\n2. ANALYSIS: Consider multiple approaches and select the most promising one\n3. TESTING:\n * For bug fixes: Create tests to verify issues before implementing fixes\n * For new features: Consider test-driven development when appropriate\n * If the repository lacks testing infrastructure and implementing tests would require extensive setup, consult with the user before investing time in building testing infrastructure\n * If the environment is not set up to run tests, consult with the user first before investing time to install all dependencies\n4. IMPLEMENTATION: Make focused, minimal changes to address the problem\n5. VERIFICATION: If the environment is set up to run tests, test your implementation thoroughly, including edge cases. 
If the environment is not set up to run tests, consult with the user first before investing time to run tests.\n\n\n\n* Only use GITHUB_TOKEN and other credentials in ways the user has explicitly requested and would expect.\n* Use APIs to work with GitHub or other platforms, unless the user asks otherwise or your task requires browsing.\n\n\n\n* When user asks you to run an application, don\'t stop if the application is not installed. Instead, please install the application and run the command again.\n* If you encounter missing dependencies:\n 1. First, look around in the repository for existing dependency files (requirements.txt, pyproject.toml, package.json, Gemfile, etc.)\n 2. If dependency files exist, use them to install all dependencies at once (e.g., `pip install -r requirements.txt`, `npm install`, etc.)\n 3. Only install individual packages directly if no dependency files are found or if only specific packages are needed\n* Similarly, if you encounter missing dependencies for essential tools requested by the user, install them when possible.\n\n\n\n* If you\'ve made repeated attempts to solve a problem but tests still fail or the user reports it\'s still broken:\n 1. Step back and reflect on 5-7 different possible sources of the problem\n 2. Assess the likelihood of each possible cause\n 3. Methodically address the most likely causes, starting with the highest probability\n 4. Document your reasoning process\n* When you run into any major issue while executing a plan from the user, please don\'t try to directly work around it. Instead, propose a new plan and confirm with the user before proceeding.\n' %} + +{{- bos_token }} + +\nWhen generating the tool call, follow the format: [TOOL_CALLS]tool_name[ARGS]arguments as json\n\n + +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content'] %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text'] %} + {%- endif %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set system_message = default_system_message %} + {%- set loop_messages = messages %} +{%- endif %} +{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }} + + +{#- Tool description appended ONLY to last user message. Edits made by Unsloth #} +{#- Tool description appended also if last message is tool. Edits made by Unsloth #} +{%- set tools_description = "" %} +{%- set has_tools = false %} + +{%- if tools is defined and tools is not none and tools|length > 0 %} + + {%- set has_tools = true %} + {%- set tools_description = "[AVAILABLE_TOOLS]" + (tools | tojson) + "[/AVAILABLE_TOOLS]" %} + + {{- tools_description }} + +{%- endif %} + +{%- for message in loop_messages %} + {%- if message['role'] == 'user' %} + + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- else %} + {{- '[INST]' }} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + + {#- Original did not have content which is weird. Added by Un-sloth. 
#} + {%- if block['text'] is defined %} + {{- block['text'] }} + {%- else %} + {{- block['content'] }} + {%- endif %} + + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- endif %} + + {%- elif message['role'] == 'system' %} + {%- if message['content'] is string %} + {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }} + {%- else %} + {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }} + {%- endif %} + + + {%- elif message['role'] == 'assistant' %} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {{- message['content'][0]['text'] }} + {%- endif %} + + {#- If User,Assistant,Tool,Tool we also need to append tools_description. Edits made by Unsloth #} + + {%- if message['tool_calls'] is defined and message['tool_calls'] is not none %} + {%- for tool in message['tool_calls'] %} + {%- set arguments = tool['function']['arguments'] %} + {%- if arguments is not string %} + {%- set arguments = arguments|tojson %} + {%- endif %} + {#- Must list tool calls AFTER assistant. Edits made by Un-sloth #} + {{- "[TOOL_CALLS]" + tool['function']['name'] + "[ARGS]" + arguments }} + {%- endfor %} + {%- endif %} + + {{- eos_token }} + + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- "[TOOL_RESULTS]" + content|string + "[/TOOL_RESULTS]" }} + + {%- else %} + {{- raise_exception('Only user, systemm assistant and tool roles are supported in the custom template made by Unsloth!') }} + {%- endif %} +{%- endfor %} +{#- Copyright 2025-present the Unsloth team. All rights reserved. #} +{#- Licensed under the Apache License, Version 2.0 (the "License") #} diff --git a/tests/accuracy/test_all_models.sh b/tests/accuracy/export_all_models.sh old mode 100755 new mode 100644 similarity index 77% rename from tests/accuracy/test_all_models.sh rename to tests/accuracy/export_all_models.sh index 1ef70abf1e..6beb4a57bb --- a/tests/accuracy/test_all_models.sh +++ b/tests/accuracy/export_all_models.sh @@ -1,6 +1,6 @@ #!/bin/bash -x # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2026 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,13 +18,6 @@ BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD) # install dependencies pip install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/${BRANCH_NAME}/demos/common/export_models/requirements.txt -rm -rf gorilla -git clone https://github.com/ShishirPatil/gorilla -cd gorilla/berkeley-function-call-leaderboard -git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/${BRANCH_NAME}/demos/continuous_batching/accuracy/gorilla.patch | git apply -v -pip install -e . -cd ../.. 
curl -L -O https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/${BRANCH_NAME}/demos/common/export_models/export_model.py mkdir -p models @@ -79,71 +72,14 @@ python export_model.py text_generation --source_model mistralai/Mistral-7B-Instr curl -L -o models/mistralai/Mistral-7B-Instruct-v0.3-int8/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_mistral_parallel.jinja python export_model.py text_generation --source_model mistralai/Mistral-7B-Instruct-v0.3 --model_name mistralai/Mistral-7B-Instruct-v0.3-fp16 --weight-format fp16 --config_file_path models/config.json --model_repository_path models --tool_parser mistral --cache_size 2 --extra_quantization_params "--task text-generation-with-past" curl -L -o models/mistralai/Mistral-7B-Instruct-v0.3-fp16/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.9.0/examples/tool_chat_template_mistral_parallel.jinja +# openai/gpt-oss-20b +python export_model.py text_generation --source_model openai/gpt-oss-20b --model_name openai/gpt-oss-20b-int4 --weight-format int4 --config_file_path models/config.json --model_repository_path models +cp ../extras/chat_template_examples/chat_template_gpt_oss.jinja models/openai/gpt-oss-20b-int4/chat_template.jinja -run_model_test() { - local model_name=$1 - local precision=$2 - local tool_parser=$3 - local enable_tool_guided_generation=${4:-false} - set -x - docker stop ovms 2>/dev/null - docker run -d --name ovms --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:latest \ - --rest_port 8000 --model_repository_path /models --source_model ${model_name}-${precision} \ - --tool_parser ${tool_parser} --model_name ovms-model --enable_tool_guided_generation $enable_tool_guided_generation \ - --cache_size 20 --task text_generation - - sleep 20 - - local result_dir="${model_name}-${precision}${enable_tool_guided_generation}" - # use short model name - result_dir=$(echo "$result_dir" | awk -F'/' '{print $NF}') - echo "Result directory: $result_dir" - sleep 10 - export OPENAI_BASE_URL=http://localhost:8000/v3 - export OPENAI_API_KEY="notused" - export TOOL_CHOICE=auto - bfcl generate --model ovms-model --test-category simple,multiple,parallel,irrelevance,multi_turn_base --num-threads 100 --result-dir $result_dir -o - bfcl generate --model ovms-model --test-category multi_turn_base --num-threads 10 --result-dir $result_dir -o - bfcl evaluate --model ovms-model --result-dir $result_dir --score-dir ${result_dir}_score -} - -# Model configurations -declare -A models=( - ["Qwen/Qwen3-8B"]="hermes3" - ["Qwen/Qwen3-4B"]="hermes3" - ["Qwen/Qwen3-1.7B"]="hermes3" - ["Qwen/Qwen3-0.6B"]="hermes3" - ["meta-llama/Llama-3.1-8B-Instruct"]="llama3" - ["meta-llama/Llama-3.2-3B-Instruct"]="llama3" - ["NousResearch/Hermes-3-Llama-3.1-8B"]="hermes3" - ["microsoft/Phi-4-mini-instruct"]="phi4" - ["mistralai/Mistral-7B-Instruct-v0.3"]="mistral" -) - -precisions=("int4" "int8" "fp16") - -# Run tests for each model and precision -# enable tool guided generation -for model in "${!models[@]}"; do - tool_parser="${models[$model]}" - for precision in "${precisions[@]}"; do - run_model_test "$model" "$precision" "$tool_parser" "true" - done -done - - -# disable tool guided generation -for model in "${!models[@]}"; do - tool_parser="${models[$model]}" - for precision in "${precisions[@]}"; do - run_model_test "$model" "$precision" "$tool_parser" "false" - done -done - -docker stop ovms 2>/dev/null - 
- -python sumarize_results.py - - +# Qwen/Qwen3-Coder-30B-Instruct +python export_model.py text_generation --source_model Qwen/Qwen3-Coder-30B-Instruct --model_name Qwen/Qwen3-Coder-30B-Instruct-int4 --weight-format int4 --config_file_path models/config.json --model_repository_path models +cp ../extras/chat_template_examples/chat_template_qwen3coder_instruct.jinja models/Qwen/Qwen3-Coder-30B-Instruct-int4/chat_template.jinja +# devstral +python export_model.py text_generation --source_model unsloth/Devstral-Small-2507 --model_name unsloth/Devstral-Small-2507-int4 --weight-format int4 --config_file_path models/config.json --model_repository_path models +cp ../extras/chat_template_examples/chat_template_devstral.jinja models/unsloth/Devstral-Small-2507-int4/chat_template.jinja \ No newline at end of file diff --git a/tests/accuracy/install_gorilla.sh b/tests/accuracy/install_gorilla.sh new file mode 100644 index 0000000000..1093c69d3f --- /dev/null +++ b/tests/accuracy/install_gorilla.sh @@ -0,0 +1,29 @@ +#!/bin/bash -x +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD) +# install dependencies +# if gorilla is already installed, it will skip installation +if [ ! -d "gorilla" ]; then + git clone https://github.com/ShishirPatil/gorilla + cd gorilla/berkeley-function-call-leaderboard + git checkout 9b8a5202544f49a846aced185a340361231ef3e1 + curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/${BRANCH_NAME}/demos/continuous_batching/accuracy/gorilla.patch | git apply -v + pip install -e . + cd ../.. 
+ cp test_case_ids_to_generate.json gorilla/berkeley-function-call-leaderboard/ +fi \ No newline at end of file diff --git a/tests/accuracy/test_case_ids_to_generate.json b/tests/accuracy/test_case_ids_to_generate.json new file mode 100644 index 0000000000..efd329dc53 --- /dev/null +++ b/tests/accuracy/test_case_ids_to_generate.json @@ -0,0 +1,4 @@ +{ + "simple_python": ["simple_python_101","simple_python_102", "simple_python_103","simple_python_104","simple_python_105","simple_python_106","simple_python_107","simple_python_108","simple_python_109","simple_python_110"], + "multi_turn_base": ["multi_turn_base_190", "multi_turn_base_191","multi_turn_base_192","multi_turn_base_193","multi_turn_base_194","multi_turn_base_195","multi_turn_base_196","multi_turn_base_197","multi_turn_base_198","multi_turn_base_199"] +} \ No newline at end of file diff --git a/tests/accuracy/test_single_model.sh b/tests/accuracy/test_single_model.sh new file mode 100644 index 0000000000..e9d71088d2 --- /dev/null +++ b/tests/accuracy/test_single_model.sh @@ -0,0 +1,22 @@ +export MODEL=$1 +export PRECISION=$2 + + +docker stop ovms 2>/dev/null +docker run -d --name ovms --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:latest \ +--rest_port 8000 --model_repository_path /models --source_model ${model_name}-${precision} \ +--tool_parser ${tool_parser} --model_name ovms-model \ +--cache_size 0 --task text_generation + +echo wait for model server to be ready +while [ "$(curl -s http://localhost:8000/v3/models | jq -r '.data[0].id')" != "${model_name}-${precision}" ] ; do echo waiting for LLM model; sleep 1; done +echo Server is ready + +export result_dir="${MODEL}-${PRECISION}" +# use short model name +result_dir=$(echo "$result_dir" | awk -F'/' '{print $NF}') +echo "Result directory: $result_dir" +export OPENAI_BASE_URL=http://localhost:8000/v3 + +bfcl generate --model ovms-model --run-ids --result-dir $result_dir -o +bfcl evaluate --model ovms-model --result-dir $result_dir --score-dir ${result_dir}_score --partial-eval diff --git a/tests/accuracy/test_small_models.sh b/tests/accuracy/test_small_models.sh new file mode 100755 index 0000000000..9b6bd13cf8 --- /dev/null +++ b/tests/accuracy/test_small_models.sh @@ -0,0 +1,86 @@ +#!/bin/bash -x +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +run_model_test() { + local model_name=$1 + local precision=$2 + local tool_parser=$3 + local enable_tool_guided_generation=${4:-false} + set -x + docker stop ovms 2>/dev/null + docker run -d --name ovms --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:latest \ + --rest_port 8000 --model_repository_path /models --source_model ${model_name}-${precision} \ + --tool_parser ${tool_parser} --model_name ovms-model --enable_tool_guided_generation $enable_tool_guided_generation \ + --cache_size 0 --task text_generation + + echo wait for model server to be ready + while [ "$(curl -s http://localhost:8000/v3/models | jq -r '.data[0].id')" != "${model_name}-${precision}" ] ; do echo waiting for LLM model; sleep 1; done + echo Server is ready + + local result_dir="${model_name}-${precision}${enable_tool_guided_generation}" + # use short model name + result_dir=$(echo "$result_dir" | awk -F'/' '{print $NF}') + echo "Result directory: $result_dir" + sleep 10 + export OPENAI_BASE_URL=http://localhost:8000/v3 + export OPENAI_API_KEY="notused" + export TOOL_CHOICE=auto + bfcl generate --model ovms-model --test-category simple_python,multiple,parallel,irrelevance,multi_turn_base --num-threads 100 --result-dir $result_dir -o + bfcl generate --model ovms-model --test-category multi_turn_base --num-threads 10 --result-dir $result_dir -o + bfcl evaluate --model ovms-model --result-dir $result_dir --score-dir ${result_dir}_score +} + +# Model configurations +declare -A models=( + ["Qwen/Qwen3-8B"]="hermes3" + ["Qwen/Qwen3-4B"]="hermes3" + ["Qwen/Qwen3-1.7B"]="hermes3" + ["Qwen/Qwen3-0.6B"]="hermes3" + ["meta-llama/Llama-3.1-8B-Instruct"]="llama3" + ["meta-llama/Llama-3.2-3B-Instruct"]="llama3" + ["NousResearch/Hermes-3-Llama-3.1-8B"]="hermes3" + ["microsoft/Phi-4-mini-instruct"]="phi4" + ["mistralai/Mistral-7B-Instruct-v0.3"]="mistral" +) + +precisions=("int4" "int8" "fp16") + +# Run tests for each model and precision +# enable tool guided generation +for model in "${!models[@]}"; do + tool_parser="${models[$model]}" + for precision in "${precisions[@]}"; do + run_model_test "$model" "$precision" "$tool_parser" "true" + done +done + + +# disable tool guided generation +for model in "${!models[@]}"; do + tool_parser="${models[$model]}" + for precision in "${precisions[@]}"; do + run_model_test "$model" "$precision" "$tool_parser" "false" + done +done + +docker stop ovms 2>/dev/null + + +python sumarize_results.py + + + From 6ed3d21a2f56bbc7b9181291d8cbdec5d5f37d1a Mon Sep 17 00:00:00 2001 From: "Trawinski, Dariusz" Date: Tue, 13 Jan 2026 16:19:49 +0100 Subject: [PATCH 3/5] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- extras/chat_template_examples/chat_template_devstral.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extras/chat_template_examples/chat_template_devstral.jinja b/extras/chat_template_examples/chat_template_devstral.jinja index 19c1fa1c3e..b01885a3e0 100644 --- a/extras/chat_template_examples/chat_template_devstral.jinja +++ b/extras/chat_template_examples/chat_template_devstral.jinja @@ -100,7 +100,7 @@ {{- "[TOOL_RESULTS]" + content|string + "[/TOOL_RESULTS]" }} {%- else %} - {{- raise_exception('Only user, systemm assistant and tool roles are supported in the custom template made by Unsloth!') }} + {{- raise_exception('Only user, system, assistant and tool roles are supported in the custom template made by Unsloth!') }} {%- endif %} {%- endfor %} {#- Copyright 
2025-present the Unsloth team. All rights reserved. #} From b258d302eedef354dcb5cedcac99bb96ed56b39d Mon Sep 17 00:00:00 2001 From: "Trawinski, Dariusz" Date: Tue, 13 Jan 2026 16:21:07 +0100 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/accuracy/test_single_model.sh | 7 ++++--- tests/accuracy/test_small_models.sh | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/accuracy/test_single_model.sh b/tests/accuracy/test_single_model.sh index e9d71088d2..00a85c9a70 100644 --- a/tests/accuracy/test_single_model.sh +++ b/tests/accuracy/test_single_model.sh @@ -1,15 +1,16 @@ export MODEL=$1 export PRECISION=$2 +export TOOL_PARSER=$3 docker stop ovms 2>/dev/null docker run -d --name ovms --user $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models openvino/model_server:latest \ ---rest_port 8000 --model_repository_path /models --source_model ${model_name}-${precision} \ ---tool_parser ${tool_parser} --model_name ovms-model \ +--rest_port 8000 --model_repository_path /models --source_model ${MODEL}-${PRECISION} \ +--tool_parser ${TOOL_PARSER} --model_name ovms-model \ --cache_size 0 --task text_generation echo wait for model server to be ready -while [ "$(curl -s http://localhost:8000/v3/models | jq -r '.data[0].id')" != "${model_name}-${precision}" ] ; do echo waiting for LLM model; sleep 1; done +while [ "$(curl -s http://localhost:8000/v3/models | jq -r '.data[0].id')" != "${MODEL}-${PRECISION}" ] ; do echo waiting for LLM model; sleep 1; done echo Server is ready export result_dir="${MODEL}-${PRECISION}" diff --git a/tests/accuracy/test_small_models.sh b/tests/accuracy/test_small_models.sh index 9b6bd13cf8..919f28c724 100755 --- a/tests/accuracy/test_small_models.sh +++ b/tests/accuracy/test_small_models.sh @@ -80,7 +80,7 @@ done docker stop ovms 2>/dev/null -python sumarize_results.py +python summarize_results.py From 98b50ccf351bd557e152109048b487b65b6bb4e0 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 13 Jan 2026 16:50:08 +0100 Subject: [PATCH 5/5] updates --- tests/accuracy/export_all_models.sh | 0 tests/accuracy/install_gorilla.sh | 7 +++++-- ...sumarize_results.py => summarize_results.py} | 0 tests/accuracy/test_single_model.sh | 17 +++++++++++++++++ tests/accuracy/test_small_models.sh | 2 +- 5 files changed, 23 insertions(+), 3 deletions(-) mode change 100644 => 100755 tests/accuracy/export_all_models.sh mode change 100644 => 100755 tests/accuracy/install_gorilla.sh rename tests/accuracy/{sumarize_results.py => summarize_results.py} (100%) mode change 100644 => 100755 tests/accuracy/test_single_model.sh diff --git a/tests/accuracy/export_all_models.sh b/tests/accuracy/export_all_models.sh old mode 100644 new mode 100755 diff --git a/tests/accuracy/install_gorilla.sh b/tests/accuracy/install_gorilla.sh old mode 100644 new mode 100755 index 1093c69d3f..de4740665a --- a/tests/accuracy/install_gorilla.sh +++ b/tests/accuracy/install_gorilla.sh @@ -1,6 +1,6 @@ #!/bin/bash -x # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2026 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,10 @@ if [ ! 
-d "gorilla" ]; then cd gorilla/berkeley-function-call-leaderboard git checkout 9b8a5202544f49a846aced185a340361231ef3e1 curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/${BRANCH_NAME}/demos/continuous_batching/accuracy/gorilla.patch | git apply -v - pip install -e . + pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu" + bfcl --help cd ../.. cp test_case_ids_to_generate.json gorilla/berkeley-function-call-leaderboard/ +else + echo "Gorilla already installed, skipping installation. Delete the 'gorilla' directory to reinstall." fi \ No newline at end of file diff --git a/tests/accuracy/sumarize_results.py b/tests/accuracy/summarize_results.py similarity index 100% rename from tests/accuracy/sumarize_results.py rename to tests/accuracy/summarize_results.py diff --git a/tests/accuracy/test_single_model.sh b/tests/accuracy/test_single_model.sh old mode 100644 new mode 100755 index e9d71088d2..a3d1d10263 --- a/tests/accuracy/test_single_model.sh +++ b/tests/accuracy/test_single_model.sh @@ -1,3 +1,20 @@ +#!/bin/bash -x +# +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + export MODEL=$1 export PRECISION=$2 diff --git a/tests/accuracy/test_small_models.sh b/tests/accuracy/test_small_models.sh index 9b6bd13cf8..cbad3fa8ef 100755 --- a/tests/accuracy/test_small_models.sh +++ b/tests/accuracy/test_small_models.sh @@ -1,6 +1,6 @@ #!/bin/bash -x # -# Copyright (c) 2024 Intel Corporation +# Copyright (c) 2026 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.