MZWNET · pull · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.15]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9a96389544a08fd829fccda28142ce2066017fde
+- feat: Add gpt-oss chat format support by exposing `strftime_now` to chat templates by @iamlemec in af637928db7351e030011085f818b034c6efc047
+- fix: rename op_offloat to op_offload in llama.py by @sergey21000 in #2046
+
 ## [0.3.14]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@79e0b68c178656bb0632cb8602d2940b755077f8

diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
-[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
+[![PyPI - Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python)
 [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
 
 Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.

diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.14"
+__version__ = "0.3.15"
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
@@ -287,18 +287,24 @@ def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
     def kv_cache_clear(self):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_clear(self.memory, True)
 
     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
+        seq_id = seq_id if seq_id >= 0 else 0  # negative ids are replaced by sequence 0; NOTE(review): llama.h treats seq_id < 0 as "match any sequence" - confirm the clamp is intended
         llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
 
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)
 
     def kv_cache_seq_keep(self, seq_id: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_keep(self.memory, seq_id)
 
     def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)
 
     def get_state_size(self) -> int:

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -92,7 +92,7 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offloat: Optional[bool] = None,
+        op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
@@ -174,7 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
-            op_offloat: offload host tensor operations to device
+            op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -343,8 +343,8 @@ def __init__(
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
 
-        if op_offloat is not None:
-            self.context_params.op_offloat = op_offloat
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
 
         if swa_full is not None:
             self.context_params.swa_full = swa_full
@@ -2097,7 +2097,7 @@ def __getstate__(self):
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
-            op_offloat=self.context_params.op_offloat,
+            op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -8,6 +8,7 @@
 import random
 import string
 
+from datetime import datetime
 from contextlib import ExitStack
 from typing import (
     Any,
@@ -214,6 +215,10 @@ def __init__(
             lstrip_blocks=True,
         ).from_string(self.template)
 
+    @staticmethod
+    def strftime_now(f: str) -> str:
+        return datetime.now().strftime(f)  # current time (naive, local timezone) rendered with strftime format `f`
+
     def __call__(
         self,
         *,
@@ -237,6 +242,7 @@ def raise_exception(message: str):
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            strftime_now=self.strftime_now,
         )
 
         stopping_criteria = None

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -381,6 +381,7 @@
 #     //LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // removed from gguf files, use Q4_0 and runtime repack
 #     LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors
 #
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -419,6 +420,7 @@
 # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 #     bool use_mmap;      // use mmap if possible
 #     bool use_mlock;     // force system to keep model in RAM
 #     bool check_tensors; // validate model tensor data
+#     bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM
-        check_tensors (bool): validate model tensor data"""
+        check_tensors (bool): validate model tensor data
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
         use_mmap: bool
         use_mlock: bool
         check_tensors: bool
+        use_extra_bufts: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p), # NOTE: unnused
@@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
         ("use_mmap", ctypes.c_bool),
         ("use_mlock", ctypes.c_bool),
         ("check_tensors", ctypes.c_bool),
+        ("use_extra_bufts", ctypes.c_bool),
     ]
 
 
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
 #     bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 #                       // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 #                       //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+#     bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+#                       // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+#                       // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
+        kv_unified (bool): use a unified buffer across the input sequences when computing the attention
     """
 
     if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload: bool
         swa_full: bool
+        kv_unified: bool
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]
 
 
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...
 
 
+# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+    """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+    ...
+
+
 # // Returns 0 on success
 # LLAMA_API uint32_t llama_model_quantize(
 #         const char * fname_inp,
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // in the order they have appeared in the batch.
 # // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
+# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
+# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     ...
 
 
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """Returns the mask token of the given vocabulary."""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",
@@ -4176,6 +4205,7 @@ def llama_log_set(
 
 #     int32_t n_p_eval;
 #     int32_t n_eval;
+#     int32_t n_reused; // number of times a ggml compute graph had been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
         ("t_eval_ms", ctypes.c_double),
         ("n_p_eval", ctypes.c_int32),
         ("n_eval", ctypes.c_int32),
+        ("n_reused", ctypes.c_int32),
     ]
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
 ]
 
 

diff --git a/vendor/llama.cpp b/vendor/llama.cpp