Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.15]

- feat: Update llama.cpp to ggerganov/llama.cpp@9a96389544a08fd829fccda28142ce2066017fde
- feat: Add gpt-oss chat format support through strftime_now in chat format by @iamlemec in af637928db7351e030011085f818b034c6efc047
- fix: rename op_offloat to op_offload in llama.py by @sergey21000 in #2046

## [0.3.14]

- feat: Update llama.cpp to ggerganov/llama.cpp@79e0b68c178656bb0632cb8602d2940b755077f8
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - Downloads](https://static.pepy.tech/badge/llama-cpp-python/month)](https://pepy.tech/projects/llama-cpp-python)
[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()

Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
Expand Down
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.14"
__version__ = "0.3.15"
6 changes: 6 additions & 0 deletions llama_cpp/_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,18 +287,24 @@ def pooling_type(self) -> int:
return llama_cpp.llama_pooling_type(self.ctx)

def kv_cache_clear(self):
    """Clear the context memory through ``llama_memory_clear``.

    Asserts that ``self.memory`` is set, then forwards it to the native
    call together with ``True`` as the second argument.
    """
    memory = self.memory
    assert memory is not None, "Memory is not initialized"
    llama_cpp.llama_memory_clear(memory, True)

def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
    """Forward a sequence-removal request to ``llama_memory_seq_rm``.

    Args:
        seq_id: Sequence id; a negative value is replaced by ``0``.
        p0: Passed through unchanged as the first position argument.
        p1: Passed through unchanged as the second position argument.
    """
    memory = self.memory
    assert memory is not None, "Memory is not initialized"
    # Negative ids are clamped to sequence 0 before reaching the native call.
    if seq_id < 0:
        seq_id = 0
    llama_cpp.llama_memory_seq_rm(memory, seq_id, p0, p1)

def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
    """Forward a sequence-copy request to ``llama_memory_seq_cp``.

    Args:
        seq_id_src: Source sequence id.
        seq_id_dst: Destination sequence id.
        p0: Passed through unchanged as the first position argument.
        p1: Passed through unchanged as the second position argument.
    """
    memory = self.memory
    assert memory is not None, "Memory is not initialized"
    llama_cpp.llama_memory_seq_cp(memory, seq_id_src, seq_id_dst, p0, p1)

def kv_cache_seq_keep(self, seq_id: int):
    """Forward ``seq_id`` to ``llama_memory_seq_keep``.

    Asserts that ``self.memory`` is set before making the native call.
    """
    memory = self.memory
    assert memory is not None, "Memory is not initialized"
    llama_cpp.llama_memory_seq_keep(memory, seq_id)

def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
    """Forward a position-shift request to ``llama_memory_seq_add``.

    Args:
        seq_id: Sequence id handed to the native call.
        p0: Passed through unchanged as the first position argument.
        p1: Passed through unchanged as the second position argument.
        shift: Passed through unchanged as the final (delta) argument.
    """
    memory = self.memory
    assert memory is not None, "Memory is not initialized"
    llama_cpp.llama_memory_seq_add(memory, seq_id, p0, p1, shift)

def get_state_size(self) -> int:
Expand Down
10 changes: 5 additions & 5 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(
embedding: bool = False,
offload_kqv: bool = True,
flash_attn: bool = False,
op_offloat: Optional[bool] = None,
op_offload: Optional[bool] = None,
swa_full: Optional[bool] = None,
# Sampling Params
no_perf: bool = False,
Expand Down Expand Up @@ -174,7 +174,7 @@ def __init__(
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
flash_attn: Use flash attention.
op_offloat: offload host tensor operations to device
op_offload: offload host tensor operations to device
swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
no_perf: Measure performance timings.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
Expand Down Expand Up @@ -343,8 +343,8 @@ def __init__(
self.context_params.offload_kqv = offload_kqv
self.context_params.flash_attn = flash_attn

if op_offloat is not None:
self.context_params.op_offloat = op_offloat
if op_offload is not None:
self.context_params.op_offload = op_offload

if swa_full is not None:
self.context_params.swa_full = swa_full
Expand Down Expand Up @@ -2097,7 +2097,7 @@ def __getstate__(self):
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
op_offloat=self.context_params.op_offloat,
op_offload=self.context_params.op_offload,
swa_full=self.context_params.swa_full,
# Sampling Params
no_perf=self.context_params.no_perf,
Expand Down
6 changes: 6 additions & 0 deletions llama_cpp/llama_chat_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import random
import string

from datetime import datetime
from contextlib import ExitStack
from typing import (
Any,
Expand Down Expand Up @@ -214,6 +215,10 @@ def __init__(
lstrip_blocks=True,
).from_string(self.template)

@staticmethod
def strftime_now(f: str) -> str:
    """Return ``datetime.now()`` rendered with the strftime pattern ``f``."""
    current = datetime.now()
    return current.strftime(f)

def __call__(
self,
*,
Expand All @@ -237,6 +242,7 @@ def raise_exception(message: str):
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
strftime_now=self.strftime_now,
)

stopping_criteria = None
Expand Down
33 changes: 32 additions & 1 deletion llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@
# //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
# LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
#
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
Expand Down Expand Up @@ -419,6 +420,7 @@
# LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
LLAMA_FTYPE_GUESSED = 1024

# enum llama_rope_scaling_type {
Expand Down Expand Up @@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
# bool use_mmap; // use mmap if possible
# bool use_mlock; // force system to keep model in RAM
# bool check_tensors; // validate model tensor data
# bool use_extra_bufts; // use extra buffer types (used for weight repacking)
# };
class llama_model_params(ctypes.Structure):
"""Parameters for llama_model
Expand All @@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
vocab_only (bool): only load the vocabulary, no weights
use_mmap (bool): use mmap if possible
use_mlock (bool): force system to keep model in RAM
check_tensors (bool): validate model tensor data"""
check_tensors (bool): validate model tensor data
use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""

if TYPE_CHECKING:
devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
Expand All @@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
use_mmap: bool
use_mlock: bool
check_tensors: bool
use_extra_bufts: bool

_fields_ = [
("devices", ctypes.c_void_p), # NOTE: unused
Expand All @@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
("use_mmap", ctypes.c_bool),
("use_mlock", ctypes.c_bool),
("check_tensors", ctypes.c_bool),
("use_extra_bufts", ctypes.c_bool),
]


Expand Down Expand Up @@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
# bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
# // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
# // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
# // ref: https://github.com/ggml-org/llama.cpp/pull/14363
# };
class llama_context_params(ctypes.Structure):
"""Parameters for llama_context
Expand Down Expand Up @@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
no_perf (bool): whether to measure performance timings
op_offload (bool): offload host tensor operations to device
swa_full (bool): use full-size SWA cache
kv_unified (bool): use a unified buffer across the input sequences when computing the attention
"""

if TYPE_CHECKING:
Expand Down Expand Up @@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
no_perf: bool
op_offload: bool
swa_full: bool
kv_unified: bool

_fields_ = [
("n_ctx", ctypes.c_uint32),
Expand Down Expand Up @@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
("no_perf", ctypes.c_bool),
("op_offload", ctypes.c_bool),
("swa_full", ctypes.c_bool),
("kv_unified", ctypes.c_bool),
]


Expand Down Expand Up @@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
...


# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
@ctypes_function(
    "llama_model_is_diffusion",
    [llama_model_p_ctypes],
    ctypes.c_bool,
)
def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
    """Report whether the model is diffusion-based (e.g. LLaDA, Dream)."""
    ...


# // Returns 0 on success
# LLAMA_API uint32_t llama_model_quantize(
# const char * fname_inp,
Expand Down Expand Up @@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
# // in the order they have appeared in the batch.
# // Rows: number of tokens for which llama_batch.logits[i] != 0
# // Cols: n_vocab
# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
@ctypes_function(
"llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
Expand Down Expand Up @@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
# // in the order they have appeared in the batch.
# // shape: [n_outputs*n_embd]
# // Otherwise, returns NULL.
# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function(
"llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
Expand Down Expand Up @@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
...


# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
@ctypes_function(
    "llama_vocab_mask",
    [llama_vocab_p_ctypes],
    llama_token,
)
def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
    """Return the mask token reported by ``llama_vocab_mask`` for ``vocab``."""
    ...


# LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
@ctypes_function(
"llama_vocab_get_add_bos",
Expand Down Expand Up @@ -4176,6 +4205,7 @@ def llama_log_set(

# int32_t n_p_eval;
# int32_t n_eval;
# int32_t n_reused; // number of times a ggml compute graph had been reused
# };
class llama_perf_context_data(ctypes.Structure):
_fields_ = [
Expand All @@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
("t_eval_ms", ctypes.c_double),
("n_p_eval", ctypes.c_int32),
("n_eval", ctypes.c_int32),
("n_reused", ctypes.c_int32),
]


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]


Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Loading