Commit a070f61

Author: Ralf Waldukat (committed)
Update to llama.cpp 2026-01-01
- Update llama.cpp submodule (2025-08-14 → 2026-01-01)
- Remove deprecated KV cache functions (use llama_memory_* instead)
- Remove llama_sampler_init_softmax (deprecated)
- Add LLAMA_ROPE_TYPE_IMROPE constant
- Add llama_flash_attn_type enum (AUTO/DISABLED/ENABLED)
- Add llama_params_fit_status enum
- Add llama_model_meta_key enum for sampling metadata
- Add llama_model_params fields: no_host, no_alloc
- Replace llama_context_params.flash_attn bool with flash_attn_type enum
- Add 15 new API functions:
  - llama_max_tensor_buft_overrides
  - llama_n_ctx_seq
  - llama_model_n_embd_inp
  - llama_model_is_hybrid
  - llama_flash_attn_type_name
  - llama_model_meta_key_str
  - llama_adapter_meta_* functions (5)
  - llama_log_get/set
  - llama_memory_breakdown_print
- Add ggml_log_callback typedef
- Disable LLAVA build (CMake incompatibility in upstream mtmd)
- Bump version 0.3.16 → 0.4.0

Breaking changes:
- flash_attn bool removed, use flash_attn_type enum
- KV cache functions removed, use llama_memory_* API

Tested with Nemotron-3-Nano-30B hybrid model.
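On the Python side, the two breaking changes translate roughly as follows; a minimal migration sketch, assuming the low-level bindings are called directly through the llama_cpp module (constant and function names are taken from the diff below, not from any other release):

    # Flash attention: the context-params bool was replaced by an enum.
    #   before: ctx_params.flash_attn = True
    ctx_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED

    # KV cache: clear state through the memory handle instead of the removed helper.
    #   before: llama_cpp.llama_kv_self_clear(ctx)
    mem = llama_cpp.llama_get_memory(ctx)
    if mem is not None:
        llama_cpp.llama_memory_clear(mem, True)  # second argument mirrors the call used in llama.py below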
1 parent c37132b commit a070f61

File tree: 6 files changed, +248 −315 lines


CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -153,6 +153,12 @@ if (LLAMA_BUILD)
         add_compile_definitions(GGML_USE_METAL)
     endif()
 
+    # Set version for mtmd (required by upstream CMakeLists.txt)
+    if (NOT DEFINED LLAMA_BUILD_NUMBER)
+        set(LLAMA_BUILD_NUMBER 0)
+    endif()
+    set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
+
     # Building llava
     add_subdirectory(vendor/llama.cpp/tools/mtmd)

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.16"
+__version__ = "0.4.0"

llama_cpp/llama.py

Lines changed: 30 additions & 11 deletions
@@ -341,7 +341,11 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
+        self.context_params.flash_attn_type = (
+            llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            if flash_attn
+            else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+        )
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
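From the caller's perspective nothing changes here: the high-level constructor still accepts a bool and maps it onto the new enum internally. A minimal sketch, assuming a local GGUF file at a hypothetical path:

    from llama_cpp import Llama

    # flash_attn=True now sets context_params.flash_attn_type to
    # LLAMA_FLASH_ATTN_TYPE_ENABLED instead of the removed flash_attn bool.
    llm = Llama(model_path="./model.gguf", flash_attn=True)  # hypothetical path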
@@ -934,7 +938,8 @@ def generate(
 
             sample_idx += 1
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                self._input_ids[:sample_idx],
+                self._scores[sample_idx - self.n_tokens, :],
             ):
                 return
             tokens_or_none = yield token
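The reformatted call above also shows the callback contract: a stopping criterion receives the token ids generated so far and the score row for the last sampled position, and returns a bool. A hedged sketch of such a callable (how it is wired in, e.g. via a stopping_criteria argument on the generation call, is assumed from this call site only):

    def stop_after_128_tokens(input_ids, scores) -> bool:
        # input_ids: tokens generated so far; scores: logits for the last position.
        return len(input_ids) >= 128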
@@ -1041,7 +1046,9 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+            if mem is not None:
+                llama_cpp.llama_memory_clear(mem, True)
             self._ctx.decode(self._batch)
             self._batch.reset()
 
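For orientation, this hunk and the next sit inside Llama.embed(): each internal batch decode now clears state through the memory handle rather than the removed KV-cache helper. A minimal usage sketch, assuming an embedding-capable GGUF at a hypothetical path:

    from llama_cpp import Llama

    llm = Llama(model_path="./embedding-model.gguf", embedding=True)  # hypothetical path
    vec = llm.embed("hello world")  # a list of floats for a single string input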

@@ -1112,7 +1119,9 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+        if mem is not None:
+            llama_cpp.llama_memory_clear(mem, True)
         self.reset()
 
         if return_count:
@@ -1157,9 +1166,9 @@ def _create_completion(
         bos_token_id: int = self.token_bos()
         cls_token_id: int = self._model.token_cls()
         sep_token_id: int = self._model.token_sep()
-        prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
-        middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
-        suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+        prefix_token_id: int = 0  # self._model.token_prefix() # TODO: Fix
+        middle_token_id: int = 0  # self._model.token_middle() # TODO: Fix
+        suffix_token_id: int = 0  # self._model.token_suffix() # TODO: Fix
         add_space_prefix: bool = (
             self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
         )
@@ -1315,7 +1324,7 @@ def logit_bias_processor(
         if seed is not None:
             self.set_seed(seed)
         else:
-            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+            self.set_seed(random.Random(self._seed).randint(0, 2**32))
 
         finish_reason = "length"
         multibyte_fix = 0
@@ -2056,7 +2065,10 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+                return (
+                    ChatCompletionChunk(**chunk)
+                    for chunk in self.create_chat_completion(*args, **kwargs)
+                )  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
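When stream=True the wrapped generator above yields OpenAI-typed chunks; a sketch of consuming it, assuming llm is an already configured Llama instance:

    messages = [{"role": "user", "content": "Say hi"}]
    for chunk in llm.create_chat_completion_openai_v1(messages=messages, stream=True):
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="")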
@@ -2096,7 +2108,10 @@ def __getstate__(self):
             logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
-            flash_attn=self.context_params.flash_attn,
+            flash_attn=(
+                self.context_params.flash_attn_type
+                == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            ),
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
@@ -2318,7 +2333,11 @@ def from_pretrained(
         if additional_files:
             for additonal_file_name in additional_files:
                 # find the additional shard file:
-                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                matching_additional_files = [
+                    file
+                    for file in file_list
+                    if fnmatch.fnmatch(file, additonal_file_name)
+                ]
 
                 if len(matching_additional_files) == 0:
                     raise ValueError(
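The list comprehension above matches shard names with fnmatch, so additional_files entries may contain shell-style wildcards. A hedged usage sketch with hypothetical repo and file names:

    from llama_cpp import Llama

    llm = Llama.from_pretrained(
        repo_id="example-org/example-model-gguf",        # hypothetical repo
        filename="model-00001-of-00002.gguf",            # hypothetical first shard
        additional_files=["model-0000[2-9]-of-*.gguf"],  # fnmatch pattern, per the hunk above
    )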
