103 changes: 86 additions & 17 deletions demos/code_local_assistant/README.md
@@ -34,7 +34,7 @@ docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
--model_name Qwen/Qwen3-Coder-30B-A3B-Instruct \
--model_path Qwen/Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** This model requires ~150GB disk space and 60GB RAM for conversion. For deployment, the model requires ~16GB disk space and the same amount of VRAM on the GPU.
> **Note:** For deployment, the model requires ~16GB disk space and a recommended 16GB+ of VRAM on the GPU. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.

Comment on lines +37 to 38
Suggested change
> **Note:** For deployment, the model requires ~16GB disk space and a recommended 16GB+ of VRAM on the GPU. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
> **Note:** For deployment, the model requires ~16GB disk space and a recommended 16GB+ of VRAM on the GPU. For conversion, the original model will be pulled and quantization will be applied. It requires RAM equal to the model size <how much?>

please fill "how much"
is it 150gb?

:::
:::{tab-item} mistralai/Codestral-22B-v0.1
@@ -56,7 +56,7 @@ docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
:::{tab-item} openai/gpt-oss-20b
:sync: openai/gpt-oss-20b
```bash
python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --overwrite_models
python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --tool_parser gptoss --reasoning_parser gptoss --target_device GPU --overwrite_models
curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_gpt_oss.jinja

docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
@@ -68,16 +68,35 @@ docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
```
> **Note:** This model requires ~13GB disk space and the same amount of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.

:::
:::{tab-item} unsloth/Devstral-Small-2507
:sync: unsloth/Devstral-Small-2507
```bash
python export_model.py text_generation --source_model unsloth/Devstral-Small-2507 --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --tool_parser devstral --target_device GPU --overwrite_models
curl -L -o models/unsloth/Devstral-Small-2507/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_devstral.jinja

docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
openvino/model_server:weekly \
--add_to_config \
--config_path /models/config_all.json \
--model_name unsloth/Devstral-Small-2507 \
--model_path unsloth/Devstral-Small-2507
```
> **Note:** This model requires ~13GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
Suggested change
> **Note:** This model requires ~13GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
> **Note:** This model requires ~13GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled and quantization will be applied. It requires RAM equal to the model size <how much?>

please fill "how much"

Also, what about the VRAM information for the other models? We are missing it for Qwen3 and Qwen2.5.


:::
:::{tab-item} OpenVINO/Qwen3-8B-int4-ov
:sync: OpenVINO/Qwen3-8B-int4-ov
```bash
docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
openvino/model_server:weekly \
--pull \
Comment on lines +92 to 93
Copilot AI Jan 15, 2026
The command structure is incorrect. The Docker image name should be followed by container configuration flags (like -d, --rm, -v), not command arguments like --pull. The --pull flag appears to be intended for the ovms executable inside the container, but the command structure doesn't properly separate Docker flags from OVMS command arguments.

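# A minimal sketch of the flag separation in question (assuming the image's
# default entrypoint is the ovms binary): Docker's own flags (-d, --rm, -v)
# come before the image name, and everything after the image name, including
# --pull, is passed to ovms inside the container:
#   docker run -d --rm -v $(pwd)/models:/models/:rw openvino/model_server:weekly \
#     --pull --source_model <model_id> --model_repository_path /models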
--source_model OpenVINO/Qwen3-8B-int4-ov \
--model_repository_path /models \
--model_name OpenVINO/Qwen3-8B-int4-ov \
--task text_generation
--task text_generation \
--tool_parser hermes3 \
--target_device GPU

docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
openvino/model_server:weekly \
@@ -95,8 +114,10 @@ docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
--source_model OpenVINO/Qwen3-4B-int4-ov \
--model_repository_path /models \
--model_name OpenVINO/Qwen3-4B-int4-ov \
--task text_generation

--task text_generation \
--tool_parser hermes3 \
--target_device GPU

docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
openvino/model_server:weekly \
--add_to_config --config_path /models/config_all.json \
@@ -108,11 +129,13 @@ docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
:sync: OpenVINO/Qwen2.5-Coder-3B-Instruct-int4-ov
```bash
docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
openvino/model_server:weekly \
--pull \
Comment on lines +132 to 133
Copilot AI Jan 15, 2026

The command structure is incorrect. Similar to the previous issue, the Docker image name should be followed by container configuration flags, not command arguments. The --pull flag and subsequent arguments should be passed to the ovms executable inside the container.

--source_model OpenVINO/Qwen2.5-Coder-3B-Instruct-int4-ov \
--model_repository_path /models \
--model_name OpenVINO/Qwen2.5-Coder-3B-Instruct-int4-ov \
--task text_generation
--task text_generation \
--target_device GPU

docker run -d --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw \
openvino/model_server:weekly \
@@ -132,49 +155,61 @@ Pull and add the model on Windows:
:::{tab-item} Qwen/Qwen3-Coder-30B-A3B-Instruct
:sync: Qwen/Qwen3-Coder-30B-A3B-Instruct
```bat
python export_model.py text_generation --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --weight-format int8 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --tool_parser qwen3coder --overwrite_models
python export_model.py text_generation --source_model Qwen/Qwen3-Coder-30B-A3B-Instruct --weight-format int8 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --tool_parser qwen3coder
curl -L -o models/Qwen/Qwen3-Coder-30B-A3B-Instruct/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_qwen3coder_instruct.jinja

ovms.exe --add_to_config --config_path models/config_all.json --model_name Qwen/Qwen3-Coder-30B-A3B-Instruct --model_path Qwen/Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** This model requires ~16GB disk space and the same amount of VRAM on the GPU.
> **Note:** This model requires ~16GB disk space and a recommended 19GB+ of VRAM on the GPU.

:::
:::{tab-item} mistralai/Codestral-22B-v0.1
:sync: mistralai/Codestral-22B-v0.1
```bat
python export_model.py text_generation --source_model mistralai/Codestral-22B-v0.1 --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --overwrite_models
python export_model.py text_generation --source_model mistralai/Codestral-22B-v0.1 --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU
curl -L -o models/mistralai/Codestral-22B-v0.1/chat_template.jinja https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.10.1.1/examples/tool_chat_template_mistral_parallel.jinja

ovms.exe --add_to_config --config_path models/config_all.json --model_name mistralai/Codestral-22B-v0.1 --model_path mistralai/Codestral-22B-v0.1

```
> **Note:** This model requires ~12GB disk space and the same amount of VRAM on the GPU.
> **Note:** This model requires ~12GB disk space and a recommended 16GB+ of VRAM on the GPU.

:::
:::{tab-item} openai/gpt-oss-20b
:sync: openai/gpt-oss-20b
```bat
python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --overwrite_models --pipeline_type LM
python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU
curl -L -o models/openai/gpt-oss-20b/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_gpt_oss.jinja

ovms.exe --add_to_config --config_path models/config_all.json --model_name openai/gpt-oss-20b --model_path openai/gpt-oss-20b
```
> **Note:** This model requires ~13GB disk space and the same amount of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
> **Note:** This model requires ~12GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
Suggested change
> **Note:** This model requires ~12GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
> **Note:** This model requires ~12GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled and quantization will be applied. It requires RAM equal to the model size <how much?>

> **Note:** When using version 2025.4.*, add the `--pipeline_type LM` parameter to export_model.py; it disables continuous batching. With 2026+ or the latest weekly release, it is not required.
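For example, on 2025.4.* the export command above would become (a sketch; all other flags unchanged):
```bat
python export_model.py text_generation --source_model openai/gpt-oss-20b --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --pipeline_type LM
```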

:::
:::{tab-item} unsloth/Devstral-Small-2507
:sync: unsloth/Devstral-Small-2507
```bat
python export_model.py text_generation --source_model unsloth/Devstral-Small-2507 --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --tool_parser devstral --target_device GPU
curl -L -o models/unsloth/Devstral-Small-2507/chat_template.jinja https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_devstral.jinja

ovms.exe --add_to_config --config_path models/config_all.json --model_name unsloth/Devstral-Small-2507 --model_path unsloth/Devstral-Small-2507
```
> **Note:** This model requires ~13GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
Suggested change
> **Note:** This model requires ~13GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled, and quantization will require RAM roughly equal to the model size.
> **Note:** This model requires ~13GB disk space and a recommended 16GB+ of VRAM on the GPU for deployment. For conversion, the original model will be pulled and quantization will be applied. It requires RAM equal to the model size <how much?>


:::
:::{tab-item} OpenVINO/Qwen3-8B-int4-ov
:sync: OpenVINO/Qwen3-8B-int4-ov
```bat
ovms.exe --pull --source_model OpenVINO/Qwen3-8B-int4-ov --model_repository_path models --model_name OpenVINO/Qwen3-8B-int4-ov --target_device GPU --task text_generation
ovms.exe --pull --source_model OpenVINO/Qwen3-8B-int4-ov --model_repository_path models --model_name OpenVINO/Qwen3-8B-int4-ov --target_device GPU --task text_generation --tool_parser hermes3

ovms.exe --add_to_config --config_path models/config_all.json --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov
```
:::
:::{tab-item} OpenVINO/Qwen3-4B-int4-ov
:sync: OpenVINO/Qwen3-4B-int4-ov
```bat
ovms.exe --pull --source_model OpenVINO/Qwen3-4B-int4-ov --model_repository_path models --model_name OpenVINO/Qwen3-4B-int4-ov --target_device GPU --task text_generation
ovms.exe --pull --source_model OpenVINO/Qwen3-4B-int4-ov --model_repository_path models --model_name OpenVINO/Qwen3-4B-int4-ov --target_device GPU --task text_generation --tool_parser hermes3

ovms.exe --add_to_config --config_path models/config_all.json --model_name OpenVINO/Qwen3-4B-int4-ov --model_path OpenVINO/Qwen3-4B-int4-ov
```
@@ -210,15 +245,15 @@ ovms --rest_port 8000 --config_path ./models/config_all.json
### Linux: via Docker with CPU
```bash
docker run -d --rm -u $(id -u):$(id -g) \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:latest --rest_port 8000 --config_path /workspace/models/config_all.json
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json
```
:::
:::{tab-item} Linux GPU
:sync: Linux GPU
### Linux: via Docker with GPU
```bash
docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:latest-gpu --rest_port 8000 --config_path /workspace/models/config_all.json
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:weekly --rest_port 8000 --config_path /workspace/models/config_all.json
```
:::
::::
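Once the server is running, a quick sanity check looks like the sketch below (the model name must match one you registered above; `/v3` is the OpenAI-compatible API base used by the assistant configs later in this demo):
```bash
# List registered models and their loading state
curl http://localhost:8000/v1/config

# Minimal chat completion against a served model
curl -s http://localhost:8000/v3/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "OpenVINO/Qwen3-8B-int4-ov", "messages": [{"role": "user", "content": "Write hello world in Python"}], "max_tokens": 64}'
```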
@@ -369,6 +404,41 @@ context:
- provider: codebase
```
:::
:::{tab-item} unsloth/Devstral-Small-2507
:sync: unsloth/Devstral-Small-2507
```
name: Local Assistant
version: 1.0.0
schema: v1
models:
- name: OVMS unsloth/Devstral-Small-2507
provider: openai
model: unsloth/Devstral-Small-2507
apiKey: unused
apiBase: http://localhost:8000/v3
roles:
- chat
- edit
- apply
- autocomplete
capabilities:
- tool_use
autocompleteOptions:
maxPromptTokens: 500
debounceDelay: 124
useCache: true
onlyMyCode: true
modelTimeout: 400
context:
- provider: code
- provider: docs
- provider: diff
- provider: terminal
- provider: problems
- provider: folder
- provider: codebase
```
:::
:::{tab-item} OpenVINO/Qwen3-8B-int4-ov
:sync: OpenVINO/Qwen3-8B-int4-ov
```
@@ -432,7 +502,6 @@ models:
extraBodyProperties:
chat_template_kwargs:
enable_thinking: false

autocompleteOptions:
maxPromptTokens: 500
debounceDelay: 124
3 changes: 2 additions & 1 deletion docs/llm/reference.md
@@ -281,14 +281,15 @@ __Tool parsers:__
- `llama3`
- `phi4`
- `mistral`
- `devstral`
- `gptoss`
- `qwen3coder`

__Reasoning parsers:__
- `qwen3`

Note that using `tools` might require a chat template other than the original.
We recommend using templates from the [vLLM repository](https://github.com/vllm-project/vllm/tree/main/examples) for `hermes3`, `llama3`, `phi4`, `mistral`, `gptoss`, and `qwen3coder` models (if available). Save the selected template as `chat_template.jinja` in the model directory and it will be used instead of the default one. If a template is not available for your model, please refer to the model's documentation or use the default template provided by the model server.
We recommend using templates from the [vLLM repository](https://github.com/vllm-project/vllm/tree/main/examples) for `hermes3`, `llama3`, `phi4`, `mistral`, `devstral`, `gptoss`, and `qwen3coder` models (if available). Save the selected template as `chat_template.jinja` in the model directory and it will be used instead of the default one. If a template is not available for your model, please refer to the model's documentation or use the default template provided by the model server.
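For example, fetching the Devstral template into its model directory (paths assumed to follow the model repository layout used elsewhere in this repository):
```bash
curl -L -o models/unsloth/Devstral-Small-2507/chat_template.jinja \
  https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/extras/chat_template_examples/chat_template_devstral.jinja
```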

When `tool_parser` is used, it's possible to leverage tool guided generation with the `enable_tool_guided_generation` option. That setting pushes the model to generate tool calls that match the schemas specified in `tools`.
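As a sketch of the request shape this applies to (a standard OpenAI-style `tools` array; the model name and function are illustrative assumptions):
```bash
curl -s http://localhost:8000/v3/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "unsloth/Devstral-Small-2507",
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }]
  }'
```
With a matching parser configured, the generated call is returned in the standard `tool_calls` field of the response message.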

Expand Down