Commit a070f61

Author: Ralf Waldukat (committed)
Update to llama.cpp 2026-01-01
- Update llama.cpp submodule (2025-08-14 → 2026-01-01)
- Remove deprecated KV cache functions (use llama_memory_* instead)
- Remove llama_sampler_init_softmax (deprecated)
- Add LLAMA_ROPE_TYPE_IMROPE constant
- Add llama_flash_attn_type enum (AUTO/DISABLED/ENABLED)
- Add llama_params_fit_status enum
- Add llama_model_meta_key enum for sampling metadata
- Add llama_model_params fields: no_host, no_alloc
- Replace llama_context_params.flash_attn bool with flash_attn_type enum
- Add 15 new API functions:
  - llama_max_tensor_buft_overrides
  - llama_n_ctx_seq
  - llama_model_n_embd_inp
  - llama_model_is_hybrid
  - llama_flash_attn_type_name
  - llama_model_meta_key_str
  - llama_adapter_meta_* functions (5)
  - llama_log_get/set
  - llama_memory_breakdown_print
- Add ggml_log_callback typedef
- Disable LLAVA build (CMake incompatibility in upstream mtmd)
- Bump version 0.3.16 → 0.4.0

Breaking changes:
- flash_attn bool removed, use flash_attn_type enum
- KV cache functions removed, use llama_memory_* API

Tested with Nemotron-3-Nano-30B hybrid model.
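On the Python side, the two breaking changes translate roughly as follows; a minimal migration sketch, assuming the low-level bindings are called directly through the llama_cpp module (constant and function names are taken from the diff below, not from any other release):

    # Flash attention: the context-params bool was replaced by an enum.
    #   before: ctx_params.flash_attn = True
    ctx_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED

    # KV cache: clear state through the memory handle instead of the removed helper.
    #   before: llama_cpp.llama_kv_self_clear(ctx)
    mem = llama_cpp.llama_get_memory(ctx)
    if mem is not None:
        llama_cpp.llama_memory_clear(mem, True)  # second argument mirrors the call used in llama.py below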
1 parent c37132b commit a070f61

File tree: 6 files changed, +248 −315 lines


CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -153,6 +153,12 @@ if (LLAMA_BUILD)
         add_compile_definitions(GGML_USE_METAL)
     endif()
 
+    # Set version for mtmd (required by upstream CMakeLists.txt)
+    if (NOT DEFINED LLAMA_BUILD_NUMBER)
+        set(LLAMA_BUILD_NUMBER 0)
+    endif()
+    set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
+
     # Building llava
     add_subdirectory(vendor/llama.cpp/tools/mtmd)

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.16"
+__version__ = "0.4.0"

llama_cpp/llama.py

Lines changed: 30 additions & 11 deletions
@@ -341,7 +341,11 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
+        self.context_params.flash_attn_type = (
+            llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            if flash_attn
+            else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+        )
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
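From the caller's perspective nothing changes here: the high-level constructor still accepts a bool and maps it onto the new enum internally. A minimal sketch, assuming a local GGUF file at a hypothetical path:

    from llama_cpp import Llama

    # flash_attn=True now sets context_params.flash_attn_type to
    # LLAMA_FLASH_ATTN_TYPE_ENABLED instead of the removed flash_attn bool.
    llm = Llama(model_path="./model.gguf", flash_attn=True)  # hypothetical path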
@@ -934,7 +938,8 @@ def generate(
 
             sample_idx += 1
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                self._input_ids[:sample_idx],
+                self._scores[sample_idx - self.n_tokens, :],
             ):
                 return
             tokens_or_none = yield token
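The reformatted call above also shows the callback contract: a stopping criterion receives the token ids generated so far and the score row for the last sampled position, and returns a bool. A hedged sketch of such a callable (how it is wired in, e.g. via a stopping_criteria argument on the generation call, is assumed from this call site only):

    def stop_after_128_tokens(input_ids, scores) -> bool:
        # input_ids: tokens generated so far; scores: logits for the last position.
        return len(input_ids) >= 128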
@@ -1041,7 +1046,9 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+            if mem is not None:
+                llama_cpp.llama_memory_clear(mem, True)
             self._ctx.decode(self._batch)
             self._batch.reset()
 
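For orientation, this hunk and the next sit inside Llama.embed(): each internal batch decode now clears state through the memory handle rather than the removed KV-cache helper. A minimal usage sketch, assuming an embedding-capable GGUF at a hypothetical path:

    from llama_cpp import Llama

    llm = Llama(model_path="./embedding-model.gguf", embedding=True)  # hypothetical path
    vec = llm.embed("hello world")  # a list of floats for a single string input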

@@ -1112,7 +1119,9 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+        if mem is not None:
+            llama_cpp.llama_memory_clear(mem, True)
         self.reset()
 
         if return_count:
@@ -1157,9 +1166,9 @@ def _create_completion(
         bos_token_id: int = self.token_bos()
         cls_token_id: int = self._model.token_cls()
         sep_token_id: int = self._model.token_sep()
-        prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
-        middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
-        suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+        prefix_token_id: int = 0  # self._model.token_prefix() # TODO: Fix
+        middle_token_id: int = 0  # self._model.token_middle() # TODO: Fix
+        suffix_token_id: int = 0  # self._model.token_suffix() # TODO: Fix
         add_space_prefix: bool = (
             self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
         )
@@ -1315,7 +1324,7 @@ def logit_bias_processor(
         if seed is not None:
             self.set_seed(seed)
         else:
-            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+            self.set_seed(random.Random(self._seed).randint(0, 2**32))
 
         finish_reason = "length"
         multibyte_fix = 0
@@ -2056,7 +2065,10 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+                return (
+                    ChatCompletionChunk(**chunk)
+                    for chunk in self.create_chat_completion(*args, **kwargs)
+                )  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
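When stream=True the wrapped generator above yields OpenAI-typed chunks; a sketch of consuming it, assuming llm is an already configured Llama instance:

    messages = [{"role": "user", "content": "Say hi"}]
    for chunk in llm.create_chat_completion_openai_v1(messages=messages, stream=True):
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="")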
@@ -2096,7 +2108,10 @@ def __getstate__(self):
             logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
-            flash_attn=self.context_params.flash_attn,
+            flash_attn=(
+                self.context_params.flash_attn_type
+                == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            ),
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
@@ -2318,7 +2333,11 @@ def from_pretrained(
         if additional_files:
             for additonal_file_name in additional_files:
                 # find the additional shard file:
-                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                matching_additional_files = [
+                    file
+                    for file in file_list
+                    if fnmatch.fnmatch(file, additonal_file_name)
+                ]
 
                 if len(matching_additional_files) == 0:
                     raise ValueError(
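The list comprehension above matches shard names with fnmatch, so additional_files entries may contain shell-style wildcards. A hedged usage sketch with hypothetical repo and file names:

    from llama_cpp import Llama

    llm = Llama.from_pretrained(
        repo_id="example-org/example-model-gguf",        # hypothetical repo
        filename="model-00001-of-00002.gguf",            # hypothetical first shard
        additional_files=["model-0000[2-9]-of-*.gguf"],  # fnmatch pattern, per the hunk above
    )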
